core: fix snapshot missing when recovery from crash (#23496)

It is because write known block only checks block and state without snapshot, which could lead to gap between newest snapshot and newest block state. However, new blocks which would cause snapshot to become fixed were ignored, since state was already known. 


Co-authored-by: Gary Rong <garyrong0905@gmail.com>
Co-authored-by: Martin Holst Swende <martin@swende.se>
This commit is contained in:
Ziyuan Zhong(仲梓源) 2021-11-01 21:09:36 +08:00 committed by GitHub
parent c2e64db3b1
commit c576fa153a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 182 additions and 10 deletions

@ -1436,11 +1436,10 @@ func (bc *BlockChain) insertChain(chain types.Blocks, verifySeals bool) (int, er
// Peek the error for the first block to decide the directing import logic // Peek the error for the first block to decide the directing import logic
it := newInsertIterator(chain, results, bc.validator) it := newInsertIterator(chain, results, bc.validator)
block, err := it.next() block, err := it.next()
// Left-trim all the known blocks // Left-trim all the known blocks that don't need to build snapshot
if err == ErrKnownBlock { if bc.skipBlock(err, it) {
// First block (and state) is known // First block (and state) is known
// 1. We did a roll-back, and should now do a re-import // 1. We did a roll-back, and should now do a re-import
// 2. The block is stored as a sidechain, and is lying about it's stateroot, and passes a stateroot // 2. The block is stored as a sidechain, and is lying about it's stateroot, and passes a stateroot
@ -1451,7 +1450,7 @@ func (bc *BlockChain) insertChain(chain types.Blocks, verifySeals bool) (int, er
localTd = bc.GetTd(current.Hash(), current.NumberU64()) localTd = bc.GetTd(current.Hash(), current.NumberU64())
externTd = bc.GetTd(block.ParentHash(), block.NumberU64()-1) // The first block can't be nil externTd = bc.GetTd(block.ParentHash(), block.NumberU64()-1) // The first block can't be nil
) )
for block != nil && err == ErrKnownBlock { for block != nil && bc.skipBlock(err, it) {
externTd = new(big.Int).Add(externTd, block.Difficulty()) externTd = new(big.Int).Add(externTd, block.Difficulty())
if localTd.Cmp(externTd) < 0 { if localTd.Cmp(externTd) < 0 {
break break
@ -1469,7 +1468,7 @@ func (bc *BlockChain) insertChain(chain types.Blocks, verifySeals bool) (int, er
// When node runs a fast sync again, it can re-import a batch of known blocks via // When node runs a fast sync again, it can re-import a batch of known blocks via
// `insertChain` while a part of them have higher total difficulty than current // `insertChain` while a part of them have higher total difficulty than current
// head full block(new pivot point). // head full block(new pivot point).
for block != nil && err == ErrKnownBlock { for block != nil && bc.skipBlock(err, it) {
log.Debug("Writing previously known block", "number", block.Number(), "hash", block.Hash()) log.Debug("Writing previously known block", "number", block.Number(), "hash", block.Hash())
if err := bc.writeKnownBlock(block); err != nil { if err := bc.writeKnownBlock(block); err != nil {
return it.index, err return it.index, err
@ -1501,8 +1500,10 @@ func (bc *BlockChain) insertChain(chain types.Blocks, verifySeals bool) (int, er
// If there are any still remaining, mark as ignored // If there are any still remaining, mark as ignored
return it.index, err return it.index, err
// Some other error occurred, abort // Some other error(except ErrKnownBlock) occurred, abort.
case err != nil: // ErrKnownBlock is allowed here since some known blocks
// still need re-execution to generate snapshots that are missing
case err != nil && !errors.Is(err, ErrKnownBlock):
bc.futureBlocks.Remove(block.Hash()) bc.futureBlocks.Remove(block.Hash())
stats.ignored += len(it.chain) stats.ignored += len(it.chain)
bc.reportBlock(block, nil, err) bc.reportBlock(block, nil, err)
@ -1520,7 +1521,7 @@ func (bc *BlockChain) insertChain(chain types.Blocks, verifySeals bool) (int, er
} }
}() }()
for ; block != nil && err == nil || err == ErrKnownBlock; block, err = it.next() { for ; block != nil && err == nil || errors.Is(err, ErrKnownBlock); block, err = it.next() {
// If the chain is terminating, stop processing blocks // If the chain is terminating, stop processing blocks
if bc.insertStopped() { if bc.insertStopped() {
log.Debug("Abort during block processing") log.Debug("Abort during block processing")
@ -1535,8 +1536,9 @@ func (bc *BlockChain) insertChain(chain types.Blocks, verifySeals bool) (int, er
// Clique blocks where they can share state among each other, so importing an // Clique blocks where they can share state among each other, so importing an
// older block might complete the state of the subsequent one. In this case, // older block might complete the state of the subsequent one. In this case,
// just skip the block (we already validated it once fully (and crashed), since // just skip the block (we already validated it once fully (and crashed), since
// its header and body was already in the database). // its header and body was already in the database). But if the corresponding
if err == ErrKnownBlock { // snapshot layer is missing, forcibly rerun the execution to build it.
if bc.skipBlock(err, it) {
logger := log.Debug logger := log.Debug
if bc.chainConfig.Clique == nil { if bc.chainConfig.Clique == nil {
logger = log.Warn logger = log.Warn
@ -2013,6 +2015,47 @@ func (bc *BlockChain) futureBlocksLoop() {
} }
} }
// skipBlock returns 'true', if the block being imported can be skipped over, meaning
// that the block does not need to be processed but can be considered already fully 'done'.
func (bc *BlockChain) skipBlock(err error, it *insertIterator) bool {
// We can only ever bypass processing if the only error returned by the validator
// is ErrKnownBlock, which means all checks passed, but we already have the block
// and state.
if !errors.Is(err, ErrKnownBlock) {
return false
}
// If we're not using snapshots, we can skip this, since we have both block
// and (trie-) state
if bc.snaps == nil {
return true
}
var (
header = it.current() // header can't be nil
parentRoot common.Hash
)
// If we also have the snapshot-state, we can skip the processing.
if bc.snaps.Snapshot(header.Root) != nil {
return true
}
// In this case, we have the trie-state but not snapshot-state. If the parent
// snapshot-state exists, we need to process this in order to not get a gap
// in the snapshot layers.
// Resolve parent block
if parent := it.previous(); parent != nil {
parentRoot = parent.Root
} else if parent = bc.GetHeaderByHash(header.ParentHash); parent != nil {
parentRoot = parent.Root
}
if parentRoot == (common.Hash{}) {
return false // Theoretically impossible case
}
// Parent is also missing snapshot: we can skip this. Otherwise process.
if bc.snaps.Snapshot(parentRoot) == nil {
return true
}
return false
}
// maintainTxIndex is responsible for the construction and deletion of the // maintainTxIndex is responsible for the construction and deletion of the
// transaction index. // transaction index.
// //

@ -150,6 +150,14 @@ func (it *insertIterator) previous() *types.Header {
return it.chain[it.index-1].Header() return it.chain[it.index-1].Header()
} }
// current returns the current header that is being processed, or nil.
func (it *insertIterator) current() *types.Header {
if it.index == -1 || it.index >= len(it.chain) {
return nil
}
return it.chain[it.index].Header()
}
// first returns the first block in the it. // first returns the first block in the it.
func (it *insertIterator) first() *types.Block { func (it *insertIterator) first() *types.Block {
return it.chain[0] return it.chain[0]

@ -1863,3 +1863,124 @@ func testRepair(t *testing.T, tt *rewindTest, snapshots bool) {
t.Errorf("Frozen block count mismatch: have %d, want %d", frozen, tt.expFrozen) t.Errorf("Frozen block count mismatch: have %d, want %d", frozen, tt.expFrozen)
} }
} }
// TestIssue23496 tests scenario described in https://github.com/ethereum/go-ethereum/pull/23496#issuecomment-926393893
// Credits to @zzyalbert for finding the issue.
//
// Local chain owns these blocks:
// G B1 B2 B3 B4
// B1: state committed
// B2: snapshot disk layer
// B3: state committed
// B4: head block
//
// Crash happens without fully persisting snapshot and in-memory states,
// chain rewinds itself to the B1 (skip B3 in order to recover snapshot)
// In this case the snapshot layer of B3 is not created because of existent
// state.
func TestIssue23496(t *testing.T) {
// It's hard to follow the test case, visualize the input
//log.Root().SetHandler(log.LvlFilterHandler(log.LvlTrace, log.StreamHandler(os.Stderr, log.TerminalFormat(true))))
// Create a temporary persistent database
datadir, err := ioutil.TempDir("", "")
if err != nil {
t.Fatalf("Failed to create temporary datadir: %v", err)
}
os.RemoveAll(datadir)
db, err := rawdb.NewLevelDBDatabaseWithFreezer(datadir, 0, 0, datadir, "", false)
if err != nil {
t.Fatalf("Failed to create persistent database: %v", err)
}
defer db.Close() // Might double close, should be fine
// Initialize a fresh chain
var (
genesis = (&Genesis{BaseFee: big.NewInt(params.InitialBaseFee)}).MustCommit(db)
engine = ethash.NewFullFaker()
config = &CacheConfig{
TrieCleanLimit: 256,
TrieDirtyLimit: 256,
TrieTimeLimit: 5 * time.Minute,
SnapshotLimit: 256,
SnapshotWait: true,
}
)
chain, err := NewBlockChain(db, config, params.AllEthashProtocolChanges, engine, vm.Config{}, nil, nil)
if err != nil {
t.Fatalf("Failed to create chain: %v", err)
}
blocks, _ := GenerateChain(params.TestChainConfig, genesis, engine, rawdb.NewMemoryDatabase(), 4, func(i int, b *BlockGen) {
b.SetCoinbase(common.Address{0x02})
b.SetDifficulty(big.NewInt(1000000))
})
// Insert block B1 and commit the state into disk
if _, err := chain.InsertChain(blocks[:1]); err != nil {
t.Fatalf("Failed to import canonical chain start: %v", err)
}
chain.stateCache.TrieDB().Commit(blocks[0].Root(), true, nil)
// Insert block B2 and commit the snapshot into disk
if _, err := chain.InsertChain(blocks[1:2]); err != nil {
t.Fatalf("Failed to import canonical chain start: %v", err)
}
if err := chain.snaps.Cap(blocks[1].Root(), 0); err != nil {
t.Fatalf("Failed to flatten snapshots: %v", err)
}
// Insert block B3 and commit the state into disk
if _, err := chain.InsertChain(blocks[2:3]); err != nil {
t.Fatalf("Failed to import canonical chain start: %v", err)
}
chain.stateCache.TrieDB().Commit(blocks[2].Root(), true, nil)
// Insert the remaining blocks
if _, err := chain.InsertChain(blocks[3:]); err != nil {
t.Fatalf("Failed to import canonical chain tail: %v", err)
}
// Pull the plug on the database, simulating a hard crash
db.Close()
// Start a new blockchain back up and see where the repair leads us
db, err = rawdb.NewLevelDBDatabaseWithFreezer(datadir, 0, 0, datadir, "", false)
if err != nil {
t.Fatalf("Failed to reopen persistent database: %v", err)
}
defer db.Close()
chain, err = NewBlockChain(db, nil, params.AllEthashProtocolChanges, engine, vm.Config{}, nil, nil)
if err != nil {
t.Fatalf("Failed to recreate chain: %v", err)
}
defer chain.Stop()
if head := chain.CurrentHeader(); head.Number.Uint64() != uint64(4) {
t.Errorf("Head header mismatch: have %d, want %d", head.Number, 4)
}
if head := chain.CurrentFastBlock(); head.NumberU64() != uint64(4) {
t.Errorf("Head fast block mismatch: have %d, want %d", head.NumberU64(), uint64(4))
}
if head := chain.CurrentBlock(); head.NumberU64() != uint64(1) {
t.Errorf("Head block mismatch: have %d, want %d", head.NumberU64(), uint64(1))
}
// Reinsert B2-B4
if _, err := chain.InsertChain(blocks[1:]); err != nil {
t.Fatalf("Failed to import canonical chain tail: %v", err)
}
if head := chain.CurrentHeader(); head.Number.Uint64() != uint64(4) {
t.Errorf("Head header mismatch: have %d, want %d", head.Number, 4)
}
if head := chain.CurrentFastBlock(); head.NumberU64() != uint64(4) {
t.Errorf("Head fast block mismatch: have %d, want %d", head.NumberU64(), uint64(4))
}
if head := chain.CurrentBlock(); head.NumberU64() != uint64(4) {
t.Errorf("Head block mismatch: have %d, want %d", head.NumberU64(), uint64(4))
}
if layer := chain.Snapshots().Snapshot(blocks[2].Root()); layer == nil {
t.Error("Failed to regenerate the snapshot of known state")
}
}