2015-07-07 03:54:22 +03:00
|
|
|
// Copyright 2015 The go-ethereum Authors
|
2015-07-22 19:48:40 +03:00
|
|
|
// This file is part of the go-ethereum library.
|
2015-07-07 03:54:22 +03:00
|
|
|
//
|
2015-07-23 19:35:11 +03:00
|
|
|
// The go-ethereum library is free software: you can redistribute it and/or modify
|
2015-07-07 03:54:22 +03:00
|
|
|
// it under the terms of the GNU Lesser General Public License as published by
|
|
|
|
// the Free Software Foundation, either version 3 of the License, or
|
|
|
|
// (at your option) any later version.
|
|
|
|
//
|
2015-07-22 19:48:40 +03:00
|
|
|
// The go-ethereum library is distributed in the hope that it will be useful,
|
2015-07-07 03:54:22 +03:00
|
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
2015-07-22 19:48:40 +03:00
|
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
2015-07-07 03:54:22 +03:00
|
|
|
// GNU Lesser General Public License for more details.
|
|
|
|
//
|
|
|
|
// You should have received a copy of the GNU Lesser General Public License
|
2015-07-22 19:48:40 +03:00
|
|
|
// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
|
2015-07-07 03:54:22 +03:00
|
|
|
|
2015-06-16 11:58:32 +03:00
|
|
|
// Package downloader contains the manual full chain synchronisation.
|
2015-04-12 13:38:25 +03:00
|
|
|
package downloader
|
|
|
|
|
|
|
|
import (
|
2015-04-18 02:10:32 +03:00
|
|
|
"errors"
|
2015-10-13 12:04:25 +03:00
|
|
|
"fmt"
|
2015-07-29 13:20:54 +03:00
|
|
|
"math/big"
|
2015-04-12 13:38:25 +03:00
|
|
|
"sync"
|
|
|
|
"sync/atomic"
|
|
|
|
"time"
|
|
|
|
|
2019-05-15 14:33:33 +03:00
|
|
|
"github.com/ethereum/go-ethereum"
|
2015-04-12 13:38:25 +03:00
|
|
|
"github.com/ethereum/go-ethereum/common"
|
2018-05-07 14:35:06 +03:00
|
|
|
"github.com/ethereum/go-ethereum/core/rawdb"
|
2021-04-29 17:33:45 +03:00
|
|
|
"github.com/ethereum/go-ethereum/core/state/snapshot"
|
2015-04-12 13:38:25 +03:00
|
|
|
"github.com/ethereum/go-ethereum/core/types"
|
2021-04-08 18:06:03 +03:00
|
|
|
"github.com/ethereum/go-ethereum/eth/protocols/eth"
|
2020-12-14 12:27:15 +03:00
|
|
|
"github.com/ethereum/go-ethereum/eth/protocols/snap"
|
2015-10-05 19:37:56 +03:00
|
|
|
"github.com/ethereum/go-ethereum/ethdb"
|
2015-05-15 01:43:00 +03:00
|
|
|
"github.com/ethereum/go-ethereum/event"
|
2017-02-22 15:10:07 +03:00
|
|
|
"github.com/ethereum/go-ethereum/log"
|
2016-05-13 13:12:13 +03:00
|
|
|
"github.com/ethereum/go-ethereum/params"
|
2019-05-13 15:28:01 +03:00
|
|
|
"github.com/ethereum/go-ethereum/trie"
|
2015-04-12 13:38:25 +03:00
|
|
|
)
|
|
|
|
|
2015-06-08 14:06:36 +03:00
|
|
|
var (
|
2015-09-28 19:27:31 +03:00
|
|
|
MaxBlockFetch = 128 // Amount of blocks to be fetched per retrieval request
|
|
|
|
MaxHeaderFetch = 192 // Amount of block headers to be fetched per retrieval request
|
2016-02-25 18:36:42 +02:00
|
|
|
MaxSkeletonSize = 128 // Number of header fetches to need for a skeleton assembly
|
2015-09-28 19:27:31 +03:00
|
|
|
MaxReceiptFetch = 256 // Amount of transaction receipts to allow fetching per request
|
|
|
|
|
2020-07-13 12:02:54 +03:00
|
|
|
maxQueuedHeaders = 32 * 1024 // [eth/62] Maximum number of headers to queue for import (DOS protection)
|
|
|
|
maxHeadersProcess = 2048 // Number of header download results to import at once into the chain
|
|
|
|
maxResultsProcess = 2048 // Number of content download results to import at once into the chain
|
|
|
|
fullMaxForkAncestry uint64 = params.FullImmutabilityThreshold // Maximum chain reorganisation (locally redeclared so tests can reduce it)
|
|
|
|
lightMaxForkAncestry uint64 = params.LightImmutabilityThreshold // Maximum chain reorganisation (locally redeclared so tests can reduce it)
|
2015-09-28 19:27:31 +03:00
|
|
|
|
2018-10-04 16:36:59 +03:00
|
|
|
reorgProtThreshold = 48 // Threshold number of recent blocks to disable mini reorg protection
|
|
|
|
reorgProtHeaderDelay = 2 // Number of headers to delay delivering to cover mini reorgs
|
|
|
|
|
2021-11-26 14:26:03 +03:00
|
|
|
fsHeaderCheckFrequency = 100 // Verification frequency of the downloaded headers during snap sync
|
2018-02-05 19:40:32 +03:00
|
|
|
fsHeaderSafetyNet = 2048 // Number of headers to discard in case a chain violation is detected
|
|
|
|
fsHeaderForceVerify = 24 // Number of headers to verify before and after the pivot to accept it
|
|
|
|
fsHeaderContCheck = 3 * time.Second // Time interval to check for header continuations during state download
|
2021-11-26 14:26:03 +03:00
|
|
|
fsMinFullBlocks = 64 // Number of blocks to retrieve fully even in snap sync
|
2015-05-15 13:14:46 +03:00
|
|
|
)
|
2015-04-19 14:30:34 +03:00
|
|
|
|
2015-05-15 13:14:46 +03:00
|
|
|
// Sentinel errors reported by the downloader. They are plain values created
// with errors.New, so callers can compare against them with == or errors.Is.
var (
	errBusy                    = errors.New("busy")
	errUnknownPeer             = errors.New("peer is unknown or unhealthy")
	errBadPeer                 = errors.New("action from bad peer ignored")
	errStallingPeer            = errors.New("peer is stalling")
	errUnsyncedPeer            = errors.New("unsynced peer")
	errNoPeers                 = errors.New("no peers to keep download active")
	errTimeout                 = errors.New("timeout")
	errEmptyHeaderSet          = errors.New("empty header set by peer")
	errPeersUnavailable        = errors.New("no peers available or all tried for download")
	errInvalidAncestor         = errors.New("retrieved ancestor is invalid")
	errInvalidChain            = errors.New("retrieved hash chain is invalid")
	errInvalidBody             = errors.New("retrieved block body is invalid")
	errInvalidReceipt          = errors.New("retrieved receipt is invalid")
	errCancelStateFetch        = errors.New("state data download canceled (requested)")
	errCancelContentProcessing = errors.New("content processing canceled (requested)")
	errCanceled                = errors.New("syncing canceled (requested)")
	errTooOld                  = errors.New("peer's protocol version too old")
	errNoAncestorFound         = errors.New("no common ancestor found")
)
|
|
|
|
|
2021-11-26 14:26:03 +03:00
|
|
|
// peerDropFn is a callback type for dropping a peer detected as malicious.
// The single argument is the identifier of the peer to drop.
type peerDropFn func(id string)
|
|
|
|
|
2015-04-12 13:38:25 +03:00
|
|
|
type Downloader struct {
|
2020-06-30 20:43:29 +03:00
|
|
|
mode uint32 // Synchronisation mode defining the strategy used (per sync cycle), use d.getMode() to get the SyncMode
|
2016-06-02 12:37:14 +03:00
|
|
|
mux *event.TypeMux // Event multiplexer to announce sync operation events
|
2015-05-15 01:43:00 +03:00
|
|
|
|
2021-11-26 14:26:03 +03:00
|
|
|
checkpoint uint64 // Checkpoint block number to enforce head against (e.g. snap sync)
|
2019-04-16 13:20:38 +03:00
|
|
|
genesis uint64 // Genesis block number to limit sync to (e.g. light client CHT)
|
|
|
|
queue *queue // Scheduler for selecting the hashes to download
|
|
|
|
peers *peerSet // Set of active peers from which download can proceed
|
2019-05-13 15:28:01 +03:00
|
|
|
|
|
|
|
stateDB ethdb.Database // Database to state sync into (and deduplicate via)
|
2021-11-26 14:26:03 +03:00
|
|
|
stateBloom *trie.SyncBloom // Bloom filter for snap trie node and contract code existence checks
|
2015-04-12 13:38:25 +03:00
|
|
|
|
2015-06-10 01:20:35 +03:00
|
|
|
// Statistics
|
2021-11-26 14:26:03 +03:00
|
|
|
syncStatsChainOrigin uint64 // Origin block number where syncing started at
|
|
|
|
syncStatsChainHeight uint64 // Highest block number known when syncing started
|
2015-10-05 19:37:56 +03:00
|
|
|
syncStatsLock sync.RWMutex // Lock protecting the sync stats fields
|
2015-06-10 01:20:35 +03:00
|
|
|
|
2017-06-27 18:15:29 +03:00
|
|
|
lightchain LightChain
|
2017-07-03 17:17:12 +03:00
|
|
|
blockchain BlockChain
|
2017-06-27 18:15:29 +03:00
|
|
|
|
2015-04-13 17:38:32 +03:00
|
|
|
// Callbacks
|
2017-06-27 18:15:29 +03:00
|
|
|
dropPeer peerDropFn // Drops a peer for misbehaving
|
2015-04-12 13:38:25 +03:00
|
|
|
|
2015-04-13 17:38:32 +03:00
|
|
|
// Status
|
2015-06-11 18:13:13 +03:00
|
|
|
synchroniseMock func(id string, hash common.Hash) error // Replacement for synchronise during testing
|
|
|
|
synchronising int32
|
|
|
|
notified int32
|
2018-02-05 19:40:32 +03:00
|
|
|
committed int32
|
all: integrate the freezer with fast sync
* all: freezer style syncing
core, eth, les, light: clean up freezer relative APIs
core, eth, les, trie, ethdb, light: clean a bit
core, eth, les, light: add unit tests
core, light: rewrite setHead function
core, eth: fix downloader unit tests
core: add receipt chain insertion test
core: use constant instead of hardcoding table name
core: fix rollback
core: fix setHead
core/rawdb: remove canonical block first and then iterate side chain
core/rawdb, ethdb: add hasAncient interface
eth/downloader: calculate ancient limit via cht first
core, eth, ethdb: lots of fixes
* eth/downloader: print ancient disable log only for fast sync
2019-04-25 17:59:48 +03:00
|
|
|
ancientLimit uint64 // The maximum block number which can be regarded as ancient data.
|
2015-04-13 17:38:32 +03:00
|
|
|
|
|
|
|
// Channels
|
2021-11-26 14:26:03 +03:00
|
|
|
headerProcCh chan []*types.Header // Channel to feed the header processor new tasks
|
2015-05-13 13:47:21 +03:00
|
|
|
|
2020-09-08 11:13:16 +03:00
|
|
|
// State sync
|
|
|
|
pivotHeader *types.Header // Pivot block header to dynamically push the syncing state root
|
|
|
|
pivotLock sync.RWMutex // Lock protecting pivot header reads from updates
|
|
|
|
|
2020-12-14 12:27:15 +03:00
|
|
|
snapSync bool // Whether to run state sync over the snap protocol
|
|
|
|
SnapSyncer *snap.Syncer // TODO(karalabe): make private! hack for now
|
eth/downloader: separate state sync from queue (#14460)
* eth/downloader: separate state sync from queue
Scheduling of state node downloads hogged the downloader queue lock when
new requests were scheduled. This caused timeouts for other requests.
With this change, state sync is fully independent of all other downloads
and doesn't involve the queue at all.
State sync is started and checked on in processContent. This is slightly
awkward because processContent doesn't have a select loop. Instead, the
queue is closed by an auxiliary goroutine when state sync fails. We
tried several alternatives to this but settled on the current approach
because it's the least amount of change overall.
Handling of the pivot block has changed slightly: the queue previously
prevented import of pivot block receipts before the state of the pivot
block was available. In this commit, the receipt will be imported before
the state. This causes an annoyance where the pivot block is committed
as fast block head even when state downloads fail. Stay tuned for more
updates in this area ;)
* eth/downloader: remove cancelTimeout channel
* eth/downloader: retry state requests on timeout
* eth/downloader: improve comment
* eth/downloader: mark peers idle when state sync is done
* eth/downloader: move pivot block splitting to processContent
This change also ensures that pivot block receipts aren't imported
before the pivot block itself.
* eth/downloader: limit state node retries
* eth/downloader: improve state node error handling and retry check
* eth/downloader: remove maxStateNodeRetries
It fails the sync too much.
* eth/downloader: remove last use of cancelCh in statesync.go
Fixes TestDeliverHeadersHang*Fast and (hopefully)
the weird cancellation behaviour at the end of fast sync.
* eth/downloader: fix leak in runStateSync
* eth/downloader: don't run processFullSyncContent in LightSync mode
* eth/downloader: improve comments
* eth/downloader: fix vet, megacheck
* eth/downloader: remove unrequested tasks anyway
* eth/downloader, trie: various polishes around duplicate items
This commit explicitly tracks duplicate and unexpected state
delieveries done against a trie Sync structure, also adding
there to import info logs.
The commit moves the db batch used to commit trie changes one
level deeper so its flushed after every node insertion. This
is needed to avoid a lot of duplicate retrievals caused by
inconsistencies between Sync internals and database. A better
approach is to track not-yet-written states in trie.Sync and
flush on commit, but I'm focuing on correctness first now.
The commit fixes a regression around pivot block fail count.
The counter previously was reset to 1 if and only if a sync
cycle progressed (inserted at least 1 entry to the database).
The current code reset it already if a node was delivered,
which is not stong enough, because unless it ends up written
to disk, an attacker can just loop and attack ad infinitum.
The commit also fixes a regression around state deliveries
and timeouts. The old downloader tracked if a delivery is
stale (none of the deliveries were requestedt), in which
case it didn't mark the node idle and did not send further
requests, since it signals a past timeout. The current code
did mark it idle even on stale deliveries, which eventually
caused two requests to be in flight at the same time, making
the deliveries always stale and mass duplicating retrievals
between multiple peers.
* eth/downloader: fix state request leak
This commit fixes the hang seen sometimes while doing the state
sync. The cause of the hang was a rare combination of events:
request state data from peer, peer drops and reconnects almost
immediately. This caused a new download task to be assigned to
the peer, overwriting the old one still waiting for a timeout,
which in turned leaked the requests out, never to be retried.
The fix is to ensure that a task assignment moves any pending
one back into the retry queue.
The commit also fixes a regression with peer dropping due to
stalls. The current code considered a peer stalling if they
timed out delivering 1 item. However, the downloader never
requests only one, the minimum is 2 (attempt to fine tune
estimated latency/bandwidth). The fix is simply to drop if
a timeout is detected at 2 items.
Apart from the above bugfixes, the commit contains some code
polishes I made while debugging the hang.
* core, eth, trie: support batched trie sync db writes
* trie: rename SyncMemCache to syncMemBatch
2017-06-22 15:26:03 +03:00
|
|
|
stateSyncStart chan *stateSync
|
|
|
|
|
2016-07-26 13:07:12 +03:00
|
|
|
// Cancellation and termination
|
2018-04-16 11:37:48 +03:00
|
|
|
cancelPeer string // Identifier of the peer currently being used as the master (cancel on drop)
|
|
|
|
cancelCh chan struct{} // Channel to cancel mid-flight syncs
|
|
|
|
cancelLock sync.RWMutex // Lock to protect the cancel channel and peer in delivers
|
|
|
|
cancelWg sync.WaitGroup // Make sure all fetcher goroutines have exited.
|
2015-08-14 21:25:41 +03:00
|
|
|
|
2016-06-01 18:07:25 +03:00
|
|
|
quitCh chan struct{} // Quit channel to signal termination
|
2020-10-13 11:58:41 +03:00
|
|
|
quitLock sync.Mutex // Lock to prevent double closes
|
2016-06-01 18:07:25 +03:00
|
|
|
|
2015-08-14 21:25:41 +03:00
|
|
|
// Testing hooks
|
2015-09-28 19:27:31 +03:00
|
|
|
syncInitHook func(uint64, uint64) // Method to call upon initiating a new sync run
|
|
|
|
bodyFetchHook func([]*types.Header) // Method to call upon starting a block body fetch
|
|
|
|
receiptFetchHook func([]*types.Header) // Method to call upon starting a receipt fetch
|
|
|
|
chainInsertHook func([]*fetchResult) // Method to call upon inserting a chain of blocks (possibly in multiple invocations)
|
2015-05-26 14:00:21 +03:00
|
|
|
}
|
|
|
|
|
2017-07-05 12:42:37 +03:00
|
|
|
// LightChain encapsulates functions required to synchronise a light chain.
|
2017-06-27 18:15:29 +03:00
|
|
|
type LightChain interface {
|
|
|
|
// HasHeader verifies a header's presence in the local chain.
|
2018-02-05 19:40:32 +03:00
|
|
|
HasHeader(common.Hash, uint64) bool
|
2017-06-27 18:15:29 +03:00
|
|
|
|
|
|
|
// GetHeaderByHash retrieves a header from the local chain.
|
|
|
|
GetHeaderByHash(common.Hash) *types.Header
|
|
|
|
|
|
|
|
// CurrentHeader retrieves the head header from the local chain.
|
|
|
|
CurrentHeader() *types.Header
|
|
|
|
|
2018-01-30 19:39:32 +03:00
|
|
|
// GetTd returns the total difficulty of a local block.
|
|
|
|
GetTd(common.Hash, uint64) *big.Int
|
2017-06-27 18:15:29 +03:00
|
|
|
|
|
|
|
// InsertHeaderChain inserts a batch of headers into the local chain.
|
|
|
|
InsertHeaderChain([]*types.Header, int) (int, error)
|
|
|
|
|
2020-08-20 13:01:24 +03:00
|
|
|
// SetHead rewinds the local chain to a new head.
|
|
|
|
SetHead(uint64) error
|
2017-06-27 18:15:29 +03:00
|
|
|
}
|
|
|
|
|
2021-11-26 14:26:03 +03:00
|
|
|
// BlockChain encapsulates functions required to sync a (full or snap) blockchain.
|
2017-06-27 18:15:29 +03:00
|
|
|
type BlockChain interface {
|
|
|
|
LightChain
|
|
|
|
|
2018-02-11 15:43:56 +03:00
|
|
|
// HasBlock verifies a block's presence in the local chain.
|
|
|
|
HasBlock(common.Hash, uint64) bool
|
2017-06-27 18:15:29 +03:00
|
|
|
|
2021-11-26 14:26:03 +03:00
|
|
|
// HasFastBlock verifies a snap block's presence in the local chain.
|
2018-11-16 14:15:05 +03:00
|
|
|
HasFastBlock(common.Hash, uint64) bool
|
|
|
|
|
2017-06-27 18:15:29 +03:00
|
|
|
// GetBlockByHash retrieves a block from the local chain.
|
|
|
|
GetBlockByHash(common.Hash) *types.Block
|
|
|
|
|
|
|
|
// CurrentBlock retrieves the head block from the local chain.
|
|
|
|
CurrentBlock() *types.Block
|
|
|
|
|
2021-11-26 14:26:03 +03:00
|
|
|
// CurrentFastBlock retrieves the head snap block from the local chain.
|
2017-06-27 18:15:29 +03:00
|
|
|
CurrentFastBlock() *types.Block
|
|
|
|
|
2021-11-26 14:26:03 +03:00
|
|
|
// SnapSyncCommitHead directly commits the head block to a certain entity.
|
|
|
|
SnapSyncCommitHead(common.Hash) error
|
2017-06-27 18:15:29 +03:00
|
|
|
|
|
|
|
// InsertChain inserts a batch of blocks into the local chain.
|
|
|
|
InsertChain(types.Blocks) (int, error)
|
|
|
|
|
|
|
|
// InsertReceiptChain inserts a batch of receipts into the local chain.
|
all: integrate the freezer with fast sync
* all: freezer style syncing
core, eth, les, light: clean up freezer relative APIs
core, eth, les, trie, ethdb, light: clean a bit
core, eth, les, light: add unit tests
core, light: rewrite setHead function
core, eth: fix downloader unit tests
core: add receipt chain insertion test
core: use constant instead of hardcoding table name
core: fix rollback
core: fix setHead
core/rawdb: remove canonical block first and then iterate side chain
core/rawdb, ethdb: add hasAncient interface
eth/downloader: calculate ancient limit via cht first
core, eth, ethdb: lots of fixes
* eth/downloader: print ancient disable log only for fast sync
2019-04-25 17:59:48 +03:00
|
|
|
InsertReceiptChain(types.Blocks, []types.Receipts, uint64) (int, error)
|
2021-04-29 17:33:45 +03:00
|
|
|
|
|
|
|
// Snapshots returns the blockchain snapshot tree to paused it during sync.
|
|
|
|
Snapshots() *snapshot.Tree
|
2017-06-27 18:15:29 +03:00
|
|
|
}
|
|
|
|
|
2015-06-11 15:56:08 +03:00
|
|
|
// New creates a new downloader to fetch hashes and blocks from remote peers.
|
2019-05-13 15:28:01 +03:00
|
|
|
func New(checkpoint uint64, stateDb ethdb.Database, stateBloom *trie.SyncBloom, mux *event.TypeMux, chain BlockChain, lightchain LightChain, dropPeer peerDropFn) *Downloader {
|
2017-06-27 18:15:29 +03:00
|
|
|
if lightchain == nil {
|
|
|
|
lightchain = chain
|
|
|
|
}
|
2016-06-01 18:07:25 +03:00
|
|
|
dl := &Downloader{
|
2017-06-28 15:25:08 +03:00
|
|
|
stateDB: stateDb,
|
2019-05-13 15:28:01 +03:00
|
|
|
stateBloom: stateBloom,
|
2017-06-28 15:25:08 +03:00
|
|
|
mux: mux,
|
2019-04-16 13:20:38 +03:00
|
|
|
checkpoint: checkpoint,
|
2020-09-02 12:01:46 +03:00
|
|
|
queue: newQueue(blockCacheMaxItems, blockCacheInitialItems),
|
2017-06-28 15:25:08 +03:00
|
|
|
peers: newPeerSet(),
|
2017-07-03 17:17:12 +03:00
|
|
|
blockchain: chain,
|
2017-06-28 15:25:08 +03:00
|
|
|
lightchain: lightchain,
|
|
|
|
dropPeer: dropPeer,
|
|
|
|
headerProcCh: make(chan []*types.Header, 1),
|
|
|
|
quitCh: make(chan struct{}),
|
2021-03-17 11:36:34 +03:00
|
|
|
SnapSyncer: snap.NewSyncer(stateDb),
|
eth/downloader: separate state sync from queue (#14460)
* eth/downloader: separate state sync from queue
Scheduling of state node downloads hogged the downloader queue lock when
new requests were scheduled. This caused timeouts for other requests.
With this change, state sync is fully independent of all other downloads
and doesn't involve the queue at all.
State sync is started and checked on in processContent. This is slightly
awkward because processContent doesn't have a select loop. Instead, the
queue is closed by an auxiliary goroutine when state sync fails. We
tried several alternatives to this but settled on the current approach
because it's the least amount of change overall.
Handling of the pivot block has changed slightly: the queue previously
prevented import of pivot block receipts before the state of the pivot
block was available. In this commit, the receipt will be imported before
the state. This causes an annoyance where the pivot block is committed
as fast block head even when state downloads fail. Stay tuned for more
updates in this area ;)
* eth/downloader: remove cancelTimeout channel
* eth/downloader: retry state requests on timeout
* eth/downloader: improve comment
* eth/downloader: mark peers idle when state sync is done
* eth/downloader: move pivot block splitting to processContent
This change also ensures that pivot block receipts aren't imported
before the pivot block itself.
* eth/downloader: limit state node retries
* eth/downloader: improve state node error handling and retry check
* eth/downloader: remove maxStateNodeRetries
It fails the sync too much.
* eth/downloader: remove last use of cancelCh in statesync.go
Fixes TestDeliverHeadersHang*Fast and (hopefully)
the weird cancellation behaviour at the end of fast sync.
* eth/downloader: fix leak in runStateSync
* eth/downloader: don't run processFullSyncContent in LightSync mode
* eth/downloader: improve comments
* eth/downloader: fix vet, megacheck
* eth/downloader: remove unrequested tasks anyway
* eth/downloader, trie: various polishes around duplicate items
This commit explicitly tracks duplicate and unexpected state
delieveries done against a trie Sync structure, also adding
there to import info logs.
The commit moves the db batch used to commit trie changes one
level deeper so its flushed after every node insertion. This
is needed to avoid a lot of duplicate retrievals caused by
inconsistencies between Sync internals and database. A better
approach is to track not-yet-written states in trie.Sync and
flush on commit, but I'm focuing on correctness first now.
The commit fixes a regression around pivot block fail count.
The counter previously was reset to 1 if and only if a sync
cycle progressed (inserted at least 1 entry to the database).
The current code reset it already if a node was delivered,
which is not stong enough, because unless it ends up written
to disk, an attacker can just loop and attack ad infinitum.
The commit also fixes a regression around state deliveries
and timeouts. The old downloader tracked if a delivery is
stale (none of the deliveries were requestedt), in which
case it didn't mark the node idle and did not send further
requests, since it signals a past timeout. The current code
did mark it idle even on stale deliveries, which eventually
caused two requests to be in flight at the same time, making
the deliveries always stale and mass duplicating retrievals
between multiple peers.
* eth/downloader: fix state request leak
This commit fixes the hang seen sometimes while doing the state
sync. The cause of the hang was a rare combination of events:
request state data from peer, peer drops and reconnects almost
immediately. This caused a new download task to be assigned to
the peer, overwriting the old one still waiting for a timeout,
which in turned leaked the requests out, never to be retried.
The fix is to ensure that a task assignment moves any pending
one back into the retry queue.
The commit also fixes a regression with peer dropping due to
stalls. The current code considered a peer stalling if they
timed out delivering 1 item. However, the downloader never
requests only one, the minimum is 2 (attempt to fine tune
estimated latency/bandwidth). The fix is simply to drop if
a timeout is detected at 2 items.
Apart from the above bugfixes, the commit contains some code
polishes I made while debugging the hang.
* core, eth, trie: support batched trie sync db writes
* trie: rename SyncMemCache to syncMemBatch
2017-06-22 15:26:03 +03:00
|
|
|
stateSyncStart: make(chan *stateSync),
|
2015-04-12 13:38:25 +03:00
|
|
|
}
|
eth/downloader: separate state sync from queue (#14460)
* eth/downloader: separate state sync from queue
Scheduling of state node downloads hogged the downloader queue lock when
new requests were scheduled. This caused timeouts for other requests.
With this change, state sync is fully independent of all other downloads
and doesn't involve the queue at all.
State sync is started and checked on in processContent. This is slightly
awkward because processContent doesn't have a select loop. Instead, the
queue is closed by an auxiliary goroutine when state sync fails. We
tried several alternatives to this but settled on the current approach
because it's the least amount of change overall.
Handling of the pivot block has changed slightly: the queue previously
prevented import of pivot block receipts before the state of the pivot
block was available. In this commit, the receipt will be imported before
the state. This causes an annoyance where the pivot block is committed
as fast block head even when state downloads fail. Stay tuned for more
updates in this area ;)
* eth/downloader: remove cancelTimeout channel
* eth/downloader: retry state requests on timeout
* eth/downloader: improve comment
* eth/downloader: mark peers idle when state sync is done
* eth/downloader: move pivot block splitting to processContent
This change also ensures that pivot block receipts aren't imported
before the pivot block itself.
* eth/downloader: limit state node retries
* eth/downloader: improve state node error handling and retry check
* eth/downloader: remove maxStateNodeRetries
It fails the sync too much.
* eth/downloader: remove last use of cancelCh in statesync.go
Fixes TestDeliverHeadersHang*Fast and (hopefully)
the weird cancellation behaviour at the end of fast sync.
* eth/downloader: fix leak in runStateSync
* eth/downloader: don't run processFullSyncContent in LightSync mode
* eth/downloader: improve comments
* eth/downloader: fix vet, megacheck
* eth/downloader: remove unrequested tasks anyway
* eth/downloader, trie: various polishes around duplicate items
This commit explicitly tracks duplicate and unexpected state
delieveries done against a trie Sync structure, also adding
there to import info logs.
The commit moves the db batch used to commit trie changes one
level deeper so its flushed after every node insertion. This
is needed to avoid a lot of duplicate retrievals caused by
inconsistencies between Sync internals and database. A better
approach is to track not-yet-written states in trie.Sync and
flush on commit, but I'm focuing on correctness first now.
The commit fixes a regression around pivot block fail count.
The counter previously was reset to 1 if and only if a sync
cycle progressed (inserted at least 1 entry to the database).
The current code reset it already if a node was delivered,
which is not stong enough, because unless it ends up written
to disk, an attacker can just loop and attack ad infinitum.
The commit also fixes a regression around state deliveries
and timeouts. The old downloader tracked if a delivery is
stale (none of the deliveries were requestedt), in which
case it didn't mark the node idle and did not send further
requests, since it signals a past timeout. The current code
did mark it idle even on stale deliveries, which eventually
caused two requests to be in flight at the same time, making
the deliveries always stale and mass duplicating retrievals
between multiple peers.
* eth/downloader: fix state request leak
This commit fixes the hang seen sometimes while doing the state
sync. The cause of the hang was a rare combination of events:
request state data from peer, peer drops and reconnects almost
immediately. This caused a new download task to be assigned to
the peer, overwriting the old one still waiting for a timeout,
which in turned leaked the requests out, never to be retried.
The fix is to ensure that a task assignment moves any pending
one back into the retry queue.
The commit also fixes a regression with peer dropping due to
stalls. The current code considered a peer stalling if they
timed out delivering 1 item. However, the downloader never
requests only one, the minimum is 2 (attempt to fine tune
estimated latency/bandwidth). The fix is simply to drop if
a timeout is detected at 2 items.
Apart from the above bugfixes, the commit contains some code
polishes I made while debugging the hang.
* core, eth, trie: support batched trie sync db writes
* trie: rename SyncMemCache to syncMemBatch
2017-06-22 15:26:03 +03:00
|
|
|
go dl.stateFetcher()
|
2016-06-01 18:07:25 +03:00
|
|
|
return dl
|
2015-04-12 13:38:25 +03:00
|
|
|
}
|
|
|
|
|
2015-10-13 12:04:25 +03:00
|
|
|
// Progress retrieves the synchronisation boundaries, specifically the origin
|
|
|
|
// block where synchronisation started at (may have failed/suspended); the block
|
|
|
|
// or header sync is currently at; and the latest known block which the sync targets.
|
2016-02-10 11:56:15 +02:00
|
|
|
//
|
2021-11-26 14:26:03 +03:00
|
|
|
// In addition, during the state download phase of snap synchronisation the number
|
2016-02-10 11:56:15 +02:00
|
|
|
// of processed and the total number of known states are also returned. Otherwise
|
|
|
|
// these are zero.
|
2016-09-06 12:39:14 +03:00
|
|
|
func (d *Downloader) Progress() ethereum.SyncProgress {
|
2016-02-10 11:56:15 +02:00
|
|
|
// Lock the current stats and return the progress
|
2015-09-09 19:02:54 +03:00
|
|
|
d.syncStatsLock.RLock()
|
|
|
|
defer d.syncStatsLock.RUnlock()
|
2015-06-10 01:20:35 +03:00
|
|
|
|
2015-10-13 12:04:25 +03:00
|
|
|
current := uint64(0)
|
2020-06-30 20:43:29 +03:00
|
|
|
mode := d.getMode()
|
2019-05-13 15:28:01 +03:00
|
|
|
switch {
|
2020-06-30 20:43:29 +03:00
|
|
|
case d.blockchain != nil && mode == FullSync:
|
2017-07-03 17:17:12 +03:00
|
|
|
current = d.blockchain.CurrentBlock().NumberU64()
|
2021-11-26 14:26:03 +03:00
|
|
|
case d.blockchain != nil && mode == SnapSync:
|
2017-07-03 17:17:12 +03:00
|
|
|
current = d.blockchain.CurrentFastBlock().NumberU64()
|
2019-05-13 15:28:01 +03:00
|
|
|
case d.lightchain != nil:
|
2017-06-27 18:15:29 +03:00
|
|
|
current = d.lightchain.CurrentHeader().Number.Uint64()
|
2019-05-13 15:28:01 +03:00
|
|
|
default:
|
2020-06-30 20:43:29 +03:00
|
|
|
log.Error("Unknown downloader chain/mode combo", "light", d.lightchain != nil, "full", d.blockchain != nil, "mode", mode)
|
2015-10-13 12:04:25 +03:00
|
|
|
}
|
2021-11-26 14:26:03 +03:00
|
|
|
progress, pending := d.SnapSyncer.Progress()
|
|
|
|
|
2016-09-06 12:39:14 +03:00
|
|
|
return ethereum.SyncProgress{
|
2021-11-26 14:26:03 +03:00
|
|
|
StartingBlock: d.syncStatsChainOrigin,
|
|
|
|
CurrentBlock: current,
|
|
|
|
HighestBlock: d.syncStatsChainHeight,
|
|
|
|
SyncedAccounts: progress.AccountSynced,
|
|
|
|
SyncedAccountBytes: uint64(progress.AccountBytes),
|
|
|
|
SyncedBytecodes: progress.BytecodeSynced,
|
|
|
|
SyncedBytecodeBytes: uint64(progress.BytecodeBytes),
|
|
|
|
SyncedStorage: progress.StorageSynced,
|
|
|
|
SyncedStorageBytes: uint64(progress.StorageBytes),
|
|
|
|
HealedTrienodes: progress.TrienodeHealSynced,
|
|
|
|
HealedTrienodeBytes: uint64(progress.TrienodeHealBytes),
|
|
|
|
HealedBytecodes: progress.BytecodeHealSynced,
|
|
|
|
HealedBytecodeBytes: uint64(progress.BytecodeHealBytes),
|
|
|
|
HealingTrienodes: pending.TrienodeHeal,
|
|
|
|
HealingBytecode: pending.BytecodeHeal,
|
2016-09-06 12:39:14 +03:00
|
|
|
}
|
2015-04-19 22:45:58 +03:00
|
|
|
}
|
|
|
|
|
2015-06-12 13:35:29 +03:00
|
|
|
// Synchronising returns whether the downloader is currently retrieving blocks.
|
2015-05-15 01:43:00 +03:00
|
|
|
func (d *Downloader) Synchronising() bool {
|
2015-11-13 18:08:15 +02:00
|
|
|
return atomic.LoadInt32(&d.synchronising) > 0
|
2015-05-15 01:43:00 +03:00
|
|
|
}
|
|
|
|
|
2015-05-11 14:26:20 +03:00
|
|
|
// RegisterPeer injects a new download peer into the set of block source to be
|
|
|
|
// used for fetching hashes and blocks from.
|
2020-12-14 12:27:15 +03:00
|
|
|
func (d *Downloader) RegisterPeer(id string, version uint, peer Peer) error {
|
|
|
|
var logger log.Logger
|
|
|
|
if len(id) < 16 {
|
|
|
|
// Tests use short IDs, don't choke on them
|
|
|
|
logger = log.New("peer", id)
|
|
|
|
} else {
|
2021-01-25 09:17:05 +03:00
|
|
|
logger = log.New("peer", id[:8])
|
2020-12-14 12:27:15 +03:00
|
|
|
}
|
2017-02-24 19:23:03 +03:00
|
|
|
logger.Trace("Registering sync peer")
|
2017-06-28 15:25:08 +03:00
|
|
|
if err := d.peers.Register(newPeerConnection(id, version, peer, logger)); err != nil {
|
2017-02-27 14:17:58 +03:00
|
|
|
logger.Error("Failed to register sync peer", "err", err)
|
2015-05-11 14:26:20 +03:00
|
|
|
return err
|
|
|
|
}
|
2015-04-12 13:38:25 +03:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2017-07-05 12:42:37 +03:00
|
|
|
// RegisterLightPeer injects a light client peer, wrapping it so it appears as a regular peer.
|
2020-12-14 12:27:15 +03:00
|
|
|
func (d *Downloader) RegisterLightPeer(id string, version uint, peer LightPeer) error {
|
2017-07-03 17:17:12 +03:00
|
|
|
return d.RegisterPeer(id, version, &lightPeerWrapper{peer})
|
|
|
|
}
|
|
|
|
|
2015-05-11 14:26:20 +03:00
|
|
|
// UnregisterPeer remove a peer from the known list, preventing any action from
|
2015-09-28 19:27:31 +03:00
|
|
|
// the specified peer. An effort is also made to return any pending fetches into
|
|
|
|
// the queue.
|
2015-05-11 14:26:20 +03:00
|
|
|
func (d *Downloader) UnregisterPeer(id string) error {
|
2016-07-26 13:07:12 +03:00
|
|
|
// Unregister the peer from the active peer set and revoke any fetch tasks
|
2020-12-14 12:27:15 +03:00
|
|
|
var logger log.Logger
|
|
|
|
if len(id) < 16 {
|
|
|
|
// Tests use short IDs, don't choke on them
|
|
|
|
logger = log.New("peer", id)
|
|
|
|
} else {
|
2021-01-25 09:17:05 +03:00
|
|
|
logger = log.New("peer", id[:8])
|
2020-12-14 12:27:15 +03:00
|
|
|
}
|
2017-02-24 19:23:03 +03:00
|
|
|
logger.Trace("Unregistering sync peer")
|
2015-05-11 14:26:20 +03:00
|
|
|
if err := d.peers.Unregister(id); err != nil {
|
2017-02-27 14:17:58 +03:00
|
|
|
logger.Error("Failed to unregister sync peer", "err", err)
|
2015-05-11 14:26:20 +03:00
|
|
|
return err
|
|
|
|
}
|
2015-09-28 19:27:31 +03:00
|
|
|
d.queue.Revoke(id)
|
2016-07-26 13:07:12 +03:00
|
|
|
|
2015-05-11 14:26:20 +03:00
|
|
|
return nil
|
2015-04-12 13:38:25 +03:00
|
|
|
}
|
|
|
|
|
2015-06-11 15:56:08 +03:00
|
|
|
// Synchronise tries to sync up our local block chain with a remote peer, both
|
|
|
|
// adding various sanity checks as well as wrapping it with various log entries.
|
2015-10-28 17:41:01 +03:00
|
|
|
func (d *Downloader) Synchronise(id string, head common.Hash, td *big.Int, mode SyncMode) error {
|
|
|
|
err := d.synchronise(id, head, td, mode)
|
2020-05-29 12:12:43 +03:00
|
|
|
|
2015-10-28 17:41:01 +03:00
|
|
|
switch err {
|
2020-05-29 12:12:43 +03:00
|
|
|
case nil, errBusy, errCanceled:
|
|
|
|
return err
|
|
|
|
}
|
2020-07-09 00:08:08 +03:00
|
|
|
if errors.Is(err, errInvalidChain) || errors.Is(err, errBadPeer) || errors.Is(err, errTimeout) ||
|
|
|
|
errors.Is(err, errStallingPeer) || errors.Is(err, errUnsyncedPeer) || errors.Is(err, errEmptyHeaderSet) ||
|
|
|
|
errors.Is(err, errPeersUnavailable) || errors.Is(err, errTooOld) || errors.Is(err, errInvalidAncestor) {
|
2020-05-29 12:12:43 +03:00
|
|
|
log.Warn("Synchronisation failed, dropping peer", "peer", id, "err", err)
|
|
|
|
if d.dropPeer == nil {
|
|
|
|
// The dropPeer method is nil when `--copydb` is used for a local copy.
|
|
|
|
// Timeouts can occur if e.g. compaction hits at the wrong time, and can be ignored
|
|
|
|
log.Warn("Downloader wants to drop peer, but peerdrop-function is not set", "peer", id)
|
|
|
|
} else {
|
|
|
|
d.dropPeer(id)
|
|
|
|
}
|
|
|
|
return err
|
|
|
|
}
|
2020-07-09 00:08:08 +03:00
|
|
|
log.Warn("Synchronisation failed, retrying", "err", err)
|
2015-10-28 17:41:01 +03:00
|
|
|
return err
|
2015-06-11 15:56:08 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
// synchronise will select the peer and use it for synchronising. If an empty string is given
|
2017-11-16 14:14:51 +03:00
|
|
|
// it will use the best peer possible and synchronize if its TD is higher than our own. If any of the
|
2015-04-24 15:40:32 +03:00
|
|
|
// checks fail an error will be returned. This method is synchronous
|
2015-10-13 12:04:25 +03:00
|
|
|
func (d *Downloader) synchronise(id string, hash common.Hash, td *big.Int, mode SyncMode) error {
|
2016-03-15 20:27:49 +02:00
|
|
|
// Mock out the synchronisation if testing
|
2015-06-11 18:13:13 +03:00
|
|
|
if d.synchroniseMock != nil {
|
|
|
|
return d.synchroniseMock(id, hash)
|
|
|
|
}
|
2015-05-07 21:07:20 +03:00
|
|
|
// Make sure only one goroutine is ever allowed past this point at once
|
2015-05-08 15:22:48 +03:00
|
|
|
if !atomic.CompareAndSwapInt32(&d.synchronising, 0, 1) {
|
2015-06-11 15:56:08 +03:00
|
|
|
return errBusy
|
2015-04-19 14:30:34 +03:00
|
|
|
}
|
2015-05-08 15:22:48 +03:00
|
|
|
defer atomic.StoreInt32(&d.synchronising, 0)
|
2015-04-24 15:40:32 +03:00
|
|
|
|
2015-05-13 16:03:05 +03:00
|
|
|
// Post a user notification of the sync (only once per session)
|
|
|
|
if atomic.CompareAndSwapInt32(&d.notified, 0, 1) {
|
2017-02-24 19:23:03 +03:00
|
|
|
log.Info("Block synchronisation started")
|
2015-05-13 16:03:05 +03:00
|
|
|
}
|
2021-11-26 14:26:03 +03:00
|
|
|
// If we are already full syncing, but have a snap-sync bloom filter laying
|
2020-05-06 16:35:04 +03:00
|
|
|
// around, make sure it doesn't use memory any more. This is a special case
|
2021-11-26 14:26:03 +03:00
|
|
|
// when the user attempts to snap sync a new empty network.
|
2019-05-13 15:28:01 +03:00
|
|
|
if mode == FullSync && d.stateBloom != nil {
|
|
|
|
d.stateBloom.Close()
|
|
|
|
}
|
2021-11-26 14:26:03 +03:00
|
|
|
// If snap sync was requested, create the snap scheduler and switch to snap
|
|
|
|
// sync mode. Long term we could drop snap sync or merge the two together,
|
2020-12-14 12:27:15 +03:00
|
|
|
// but until snap becomes prevalent, we should support both. TODO(karalabe).
|
|
|
|
if mode == SnapSync {
|
2021-11-26 14:26:03 +03:00
|
|
|
// Snap sync uses the snapshot namespace to store potentially flakey data until
|
|
|
|
// sync completely heals and finishes. Pause snapshot maintenance in the mean-
|
|
|
|
// time to prevent access.
|
|
|
|
if snapshots := d.blockchain.Snapshots(); snapshots != nil { // Only nil in tests
|
|
|
|
snapshots.Disable()
|
2020-12-14 12:27:15 +03:00
|
|
|
}
|
|
|
|
}
|
2015-09-28 19:27:31 +03:00
|
|
|
// Reset the queue, peer set and wake channels to clean any internal leftover state
|
2020-09-02 12:01:46 +03:00
|
|
|
d.queue.Reset(blockCacheMaxItems, blockCacheInitialItems)
|
2015-05-11 14:26:20 +03:00
|
|
|
d.peers.Reset()
|
2015-05-08 17:21:11 +03:00
|
|
|
|
2021-11-26 14:26:03 +03:00
|
|
|
for _, ch := range []chan bool{d.queue.blockWakeCh, d.queue.receiptWakeCh} {
|
2015-09-28 19:27:31 +03:00
|
|
|
select {
|
|
|
|
case <-ch:
|
|
|
|
default:
|
|
|
|
}
|
2015-09-23 12:39:17 +03:00
|
|
|
}
|
2016-02-25 18:36:42 +02:00
|
|
|
for empty := false; !empty; {
|
|
|
|
select {
|
|
|
|
case <-d.headerProcCh:
|
|
|
|
default:
|
|
|
|
empty = true
|
|
|
|
}
|
|
|
|
}
|
2016-07-26 13:07:12 +03:00
|
|
|
// Create cancel channel for aborting mid-flight and mark the master peer
|
2015-06-18 00:04:57 +03:00
|
|
|
d.cancelLock.Lock()
|
|
|
|
d.cancelCh = make(chan struct{})
|
2016-07-26 13:07:12 +03:00
|
|
|
d.cancelPeer = id
|
2015-06-18 00:04:57 +03:00
|
|
|
d.cancelLock.Unlock()
|
|
|
|
|
2017-03-22 03:37:24 +03:00
|
|
|
defer d.Cancel() // No matter what, we can't leave the cancel channel open
|
2016-05-30 12:01:50 +03:00
|
|
|
|
2020-06-30 20:43:29 +03:00
|
|
|
// Atomically set the requested sync mode
|
|
|
|
atomic.StoreUint32(&d.mode, uint32(mode))
|
2018-02-05 19:40:32 +03:00
|
|
|
|
2015-05-07 21:07:20 +03:00
|
|
|
// Retrieve the origin peer and initiate the downloading process
|
2015-05-11 14:26:20 +03:00
|
|
|
p := d.peers.Peer(id)
|
2015-04-24 15:40:32 +03:00
|
|
|
if p == nil {
|
2015-04-24 16:37:32 +03:00
|
|
|
return errUnknownPeer
|
2015-04-13 17:38:32 +03:00
|
|
|
}
|
2015-07-29 13:20:54 +03:00
|
|
|
return d.syncWithPeer(p, hash, td)
|
2015-05-01 01:23:51 +03:00
|
|
|
}
|
|
|
|
|
2020-06-30 20:43:29 +03:00
|
|
|
func (d *Downloader) getMode() SyncMode {
|
|
|
|
return SyncMode(atomic.LoadUint32(&d.mode))
|
|
|
|
}
|
|
|
|
|
2015-05-11 14:26:20 +03:00
|
|
|
// syncWithPeer starts a block synchronization based on the hash chain from the
|
|
|
|
// specified peer and head hash.
|
2017-06-28 15:25:08 +03:00
|
|
|
func (d *Downloader) syncWithPeer(p *peerConnection, hash common.Hash, td *big.Int) (err error) {
|
2015-05-16 13:29:19 +03:00
|
|
|
d.mux.Post(StartEvent{})
|
2015-05-01 01:23:51 +03:00
|
|
|
defer func() {
|
|
|
|
// reset on error
|
|
|
|
if err != nil {
|
2015-05-15 01:43:00 +03:00
|
|
|
d.mux.Post(FailedEvent{err})
|
|
|
|
} else {
|
cmd,eth: 16400 Add an option to stop geth once in sync. WIP for light mode (#17321)
* cmd, eth: Added in the flag to step geth once sync based on input
* cmd, eth: 16400 Add an option to stop geth once in sync.
* cmd: 16400 Add an option to stop geth once in sync. WIP
* cmd/geth/main, les/fletcher: added in light mode support
* cmd/geth/main, les/fletcher: Cleaned Comments and code for light mode
* cmd: 16400 Fixed formatting issue and cleaned code
* cmd, eth, les: 16400 Fixed formatting issues
* cmd, eth, les: Performed gofmt to update formatting
* cmd, eth, les: Fixed bugs resulting formatting
* cmd/geth, eth/, les: switched to downloader event
* eth: Fixed styling and gen_config
* eth/: Fix nil error in config file
* cmd/geth: Updated countdown log
* les/fetcher.go: Removed depcreated channel
* eth/downloader.go: Removed deprecated select
* cmd/geth, cmd/utils: Fixed minor issues
* eth: Reverted config files to proper format
* eth: Fixed typo in config file
* cmd/geth, eth/down: Updated code to use header time stamp
* eth/downloader: Changed the time threshold to 10 minutes
* cmd/geth, eth/downloader: Updated downloading event to pass latest header
* cmd/geth: Updated main to use right timer object
* cmd/geth: Removed unused failed event
* cmd/geth: added in correct time field with type assertion
* cmd/geth, cmd/utils: Updated flag to use boolean
* cmd/geth, cmd/utils, eth/downloader: Cleaned up code based on recommendations
* cmd/geth: Removed unneeded import
* cmd/geth, eth/downloader: fixed event field and suggested changes
* cmd/geth, cmd/utils: Updated flag and linting issue
2019-01-30 10:40:36 +03:00
|
|
|
latest := d.lightchain.CurrentHeader()
|
|
|
|
d.mux.Post(DoneEvent{latest})
|
2015-05-01 01:23:51 +03:00
|
|
|
}
|
|
|
|
}()
|
2021-08-24 21:52:58 +03:00
|
|
|
if p.version < eth.ETH66 {
|
|
|
|
return fmt.Errorf("%w: advertized %d < required %d", errTooOld, p.version, eth.ETH66)
|
2016-07-21 12:36:38 +03:00
|
|
|
}
|
2020-06-30 20:43:29 +03:00
|
|
|
mode := d.getMode()
|
2015-04-24 15:40:32 +03:00
|
|
|
|
2020-06-30 20:43:29 +03:00
|
|
|
log.Debug("Synchronising with the network", "peer", p.id, "eth", p.version, "head", hash, "td", td, "mode", mode)
|
2015-09-30 19:23:31 +03:00
|
|
|
defer func(start time.Time) {
|
2019-05-15 14:33:33 +03:00
|
|
|
log.Debug("Synchronisation terminated", "elapsed", common.PrettyDuration(time.Since(start)))
|
2015-09-30 19:23:31 +03:00
|
|
|
}(time.Now())
|
2015-08-14 21:25:41 +03:00
|
|
|
|
2016-07-21 12:36:38 +03:00
|
|
|
// Look up the sync boundaries: the common ancestor and the target block
|
2020-09-08 11:13:16 +03:00
|
|
|
latest, pivot, err := d.fetchHead(p)
|
2016-07-21 12:36:38 +03:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2021-11-26 14:26:03 +03:00
|
|
|
if mode == SnapSync && pivot == nil {
|
2020-09-08 11:13:16 +03:00
|
|
|
// If no pivot block was returned, the head is below the min full block
|
2021-11-26 14:26:03 +03:00
|
|
|
// threshold (i.e. new chain). In that case we won't really snap sync
|
2020-09-08 11:13:16 +03:00
|
|
|
// anyway, but still need a valid pivot block to avoid some code hitting
|
|
|
|
// nil panics on an access.
|
|
|
|
pivot = d.blockchain.CurrentBlock().Header()
|
|
|
|
}
|
2016-07-21 12:36:38 +03:00
|
|
|
height := latest.Number.Uint64()
|
2015-09-09 19:02:54 +03:00
|
|
|
|
2018-11-12 16:18:56 +03:00
|
|
|
origin, err := d.findAncestor(p, latest)
|
2016-07-21 12:36:38 +03:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
d.syncStatsLock.Lock()
|
|
|
|
if d.syncStatsChainHeight <= origin || d.syncStatsChainOrigin > origin {
|
|
|
|
d.syncStatsChainOrigin = origin
|
|
|
|
}
|
|
|
|
d.syncStatsChainHeight = height
|
|
|
|
d.syncStatsLock.Unlock()
|
2016-05-27 14:26:00 +03:00
|
|
|
|
2021-11-26 14:26:03 +03:00
|
|
|
// Ensure our origin point is below any snap sync pivot point
|
|
|
|
if mode == SnapSync {
|
2018-02-05 19:40:32 +03:00
|
|
|
if height <= uint64(fsMinFullBlocks) {
|
|
|
|
origin = 0
|
2016-07-21 12:36:38 +03:00
|
|
|
} else {
|
2020-09-08 11:13:16 +03:00
|
|
|
pivotNumber := pivot.Number.Uint64()
|
|
|
|
if pivotNumber <= origin {
|
|
|
|
origin = pivotNumber - 1
|
2016-07-21 12:36:38 +03:00
|
|
|
}
|
2020-08-20 13:01:24 +03:00
|
|
|
// Write out the pivot into the database so a rollback beyond it will
|
2021-11-26 14:26:03 +03:00
|
|
|
// reenable snap sync
|
2020-09-08 11:13:16 +03:00
|
|
|
rawdb.WriteLastPivotNumber(d.stateDB, pivotNumber)
|
2015-09-09 19:02:54 +03:00
|
|
|
}
|
2016-07-21 12:36:38 +03:00
|
|
|
}
|
2018-02-05 19:40:32 +03:00
|
|
|
d.committed = 1
|
2021-11-26 14:26:03 +03:00
|
|
|
if mode == SnapSync && pivot.Number.Uint64() != 0 {
|
2018-02-05 19:40:32 +03:00
|
|
|
d.committed = 0
|
|
|
|
}
|
2021-11-26 14:26:03 +03:00
|
|
|
if mode == SnapSync {
|
all: integrate the freezer with fast sync
* all: freezer style syncing
core, eth, les, light: clean up freezer relative APIs
core, eth, les, trie, ethdb, light: clean a bit
core, eth, les, light: add unit tests
core, light: rewrite setHead function
core, eth: fix downloader unit tests
core: add receipt chain insertion test
core: use constant instead of hardcoding table name
core: fix rollback
core: fix setHead
core/rawdb: remove canonical block first and then iterate side chain
core/rawdb, ethdb: add hasAncient interface
eth/downloader: calculate ancient limit via cht first
core, eth, ethdb: lots of fixes
* eth/downloader: print ancient disable log only for fast sync
2019-04-25 17:59:48 +03:00
|
|
|
// Set the ancient data limitation.
|
2021-11-26 14:26:03 +03:00
|
|
|
// If we are running snap sync, all block data older than ancientLimit will be
|
core, cmd, vendor: fixes and database inspection tool (#15)
* core, eth: some fixes for freezer
* vendor, core/rawdb, cmd/geth: add db inspector
* core, cmd/utils: check ancient store path forceily
* cmd/geth, common, core/rawdb: a few fixes
* cmd/geth: support windows file rename and fix rename error
* core: support ancient plugin
* core, cmd: streaming file copy
* cmd, consensus, core, tests: keep genesis in leveldb
* core: write txlookup during ancient init
* core: bump database version
2019-05-14 17:07:44 +03:00
|
|
|
// written to the ancient store. More recent data will be written to the active
|
|
|
|
// database and will wait for the freezer to migrate.
|
all: integrate the freezer with fast sync
* all: freezer style syncing
core, eth, les, light: clean up freezer relative APIs
core, eth, les, trie, ethdb, light: clean a bit
core, eth, les, light: add unit tests
core, light: rewrite setHead function
core, eth: fix downloader unit tests
core: add receipt chain insertion test
core: use constant instead of hardcoding table name
core: fix rollback
core: fix setHead
core/rawdb: remove canonical block first and then iterate side chain
core/rawdb, ethdb: add hasAncient interface
eth/downloader: calculate ancient limit via cht first
core, eth, ethdb: lots of fixes
* eth/downloader: print ancient disable log only for fast sync
2019-04-25 17:59:48 +03:00
|
|
|
//
|
core, cmd, vendor: fixes and database inspection tool (#15)
* core, eth: some fixes for freezer
* vendor, core/rawdb, cmd/geth: add db inspector
* core, cmd/utils: check ancient store path forceily
* cmd/geth, common, core/rawdb: a few fixes
* cmd/geth: support windows file rename and fix rename error
* core: support ancient plugin
* core, cmd: streaming file copy
* cmd, consensus, core, tests: keep genesis in leveldb
* core: write txlookup during ancient init
* core: bump database version
2019-05-14 17:07:44 +03:00
|
|
|
// If there is a checkpoint available, then calculate the ancientLimit through
|
|
|
|
// that. Otherwise calculate the ancient limit through the advertised height
|
|
|
|
// of the remote peer.
|
all: integrate the freezer with fast sync
* all: freezer style syncing
core, eth, les, light: clean up freezer relative APIs
core, eth, les, trie, ethdb, light: clean a bit
core, eth, les, light: add unit tests
core, light: rewrite setHead function
core, eth: fix downloader unit tests
core: add receipt chain insertion test
core: use constant instead of hardcoding table name
core: fix rollback
core: fix setHead
core/rawdb: remove canonical block first and then iterate side chain
core/rawdb, ethdb: add hasAncient interface
eth/downloader: calculate ancient limit via cht first
core, eth, ethdb: lots of fixes
* eth/downloader: print ancient disable log only for fast sync
2019-04-25 17:59:48 +03:00
|
|
|
//
|
core, cmd, vendor: fixes and database inspection tool (#15)
* core, eth: some fixes for freezer
* vendor, core/rawdb, cmd/geth: add db inspector
* core, cmd/utils: check ancient store path forceily
* cmd/geth, common, core/rawdb: a few fixes
* cmd/geth: support windows file rename and fix rename error
* core: support ancient plugin
* core, cmd: streaming file copy
* cmd, consensus, core, tests: keep genesis in leveldb
* core: write txlookup during ancient init
* core: bump database version
2019-05-14 17:07:44 +03:00
|
|
|
// The reason for picking checkpoint first is that a malicious peer can give us
|
|
|
|
// a fake (very high) height, forcing the ancient limit to also be very high.
|
|
|
|
// The peer would start to feed us valid blocks until head, resulting in all of
|
|
|
|
// the blocks might be written into the ancient store. A following mini-reorg
|
|
|
|
// could cause issues.
|
2020-07-13 12:02:54 +03:00
|
|
|
if d.checkpoint != 0 && d.checkpoint > fullMaxForkAncestry+1 {
|
core, cmd, vendor: fixes and database inspection tool (#15)
* core, eth: some fixes for freezer
* vendor, core/rawdb, cmd/geth: add db inspector
* core, cmd/utils: check ancient store path forceily
* cmd/geth, common, core/rawdb: a few fixes
* cmd/geth: support windows file rename and fix rename error
* core: support ancient plugin
* core, cmd: streaming file copy
* cmd, consensus, core, tests: keep genesis in leveldb
* core: write txlookup during ancient init
* core: bump database version
2019-05-14 17:07:44 +03:00
|
|
|
d.ancientLimit = d.checkpoint
|
2020-07-13 12:02:54 +03:00
|
|
|
} else if height > fullMaxForkAncestry+1 {
|
|
|
|
d.ancientLimit = height - fullMaxForkAncestry - 1
|
2020-10-09 09:58:30 +03:00
|
|
|
} else {
|
|
|
|
d.ancientLimit = 0
|
all: integrate the freezer with fast sync
* all: freezer style syncing
core, eth, les, light: clean up freezer relative APIs
core, eth, les, trie, ethdb, light: clean a bit
core, eth, les, light: add unit tests
core, light: rewrite setHead function
core, eth: fix downloader unit tests
core: add receipt chain insertion test
core: use constant instead of hardcoding table name
core: fix rollback
core: fix setHead
core/rawdb: remove canonical block first and then iterate side chain
core/rawdb, ethdb: add hasAncient interface
eth/downloader: calculate ancient limit via cht first
core, eth, ethdb: lots of fixes
* eth/downloader: print ancient disable log only for fast sync
2019-04-25 17:59:48 +03:00
|
|
|
}
|
|
|
|
frozen, _ := d.stateDB.Ancients() // Ignore the error here since light client can also hit here.
|
2020-08-20 13:01:24 +03:00
|
|
|
|
all: integrate the freezer with fast sync
* all: freezer style syncing
core, eth, les, light: clean up freezer relative APIs
core, eth, les, trie, ethdb, light: clean a bit
core, eth, les, light: add unit tests
core, light: rewrite setHead function
core, eth: fix downloader unit tests
core: add receipt chain insertion test
core: use constant instead of hardcoding table name
core: fix rollback
core: fix setHead
core/rawdb: remove canonical block first and then iterate side chain
core/rawdb, ethdb: add hasAncient interface
eth/downloader: calculate ancient limit via cht first
core, eth, ethdb: lots of fixes
* eth/downloader: print ancient disable log only for fast sync
2019-04-25 17:59:48 +03:00
|
|
|
// If a part of blockchain data has already been written into active store,
|
|
|
|
// disable the ancient style insertion explicitly.
|
|
|
|
if origin >= frozen && frozen != 0 {
|
|
|
|
d.ancientLimit = 0
|
|
|
|
log.Info("Disabling direct-ancient mode", "origin", origin, "ancient", frozen-1)
|
|
|
|
} else if d.ancientLimit > 0 {
|
|
|
|
log.Debug("Enabling direct-ancient mode", "ancient", d.ancientLimit)
|
|
|
|
}
|
|
|
|
// Rewind the ancient store and blockchain if reorg happens.
|
|
|
|
if origin+1 < frozen {
|
2021-11-22 12:11:59 +03:00
|
|
|
if err := d.lightchain.SetHead(origin); err != nil {
|
2020-08-20 13:01:24 +03:00
|
|
|
return err
|
all: integrate the freezer with fast sync
* all: freezer style syncing
core, eth, les, light: clean up freezer relative APIs
core, eth, les, trie, ethdb, light: clean a bit
core, eth, les, light: add unit tests
core, light: rewrite setHead function
core, eth: fix downloader unit tests
core: add receipt chain insertion test
core: use constant instead of hardcoding table name
core: fix rollback
core: fix setHead
core/rawdb: remove canonical block first and then iterate side chain
core/rawdb, ethdb: add hasAncient interface
eth/downloader: calculate ancient limit via cht first
core, eth, ethdb: lots of fixes
* eth/downloader: print ancient disable log only for fast sync
2019-04-25 17:59:48 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2018-02-05 19:40:32 +03:00
|
|
|
// Initiate the sync using a concurrent header and content retrieval algorithm
|
2020-06-30 20:43:29 +03:00
|
|
|
d.queue.Prepare(origin+1, mode)
|
2016-07-21 12:36:38 +03:00
|
|
|
if d.syncInitHook != nil {
|
|
|
|
d.syncInitHook(origin, height)
|
2015-04-12 13:38:25 +03:00
|
|
|
}
|
eth/downloader: separate state sync from queue (#14460)
* eth/downloader: separate state sync from queue
Scheduling of state node downloads hogged the downloader queue lock when
new requests were scheduled. This caused timeouts for other requests.
With this change, state sync is fully independent of all other downloads
and doesn't involve the queue at all.
State sync is started and checked on in processContent. This is slightly
awkward because processContent doesn't have a select loop. Instead, the
queue is closed by an auxiliary goroutine when state sync fails. We
tried several alternatives to this but settled on the current approach
because it's the least amount of change overall.
Handling of the pivot block has changed slightly: the queue previously
prevented import of pivot block receipts before the state of the pivot
block was available. In this commit, the receipt will be imported before
the state. This causes an annoyance where the pivot block is committed
as fast block head even when state downloads fail. Stay tuned for more
updates in this area ;)
* eth/downloader: remove cancelTimeout channel
* eth/downloader: retry state requests on timeout
* eth/downloader: improve comment
* eth/downloader: mark peers idle when state sync is done
* eth/downloader: move pivot block splitting to processContent
This change also ensures that pivot block receipts aren't imported
before the pivot block itself.
* eth/downloader: limit state node retries
* eth/downloader: improve state node error handling and retry check
* eth/downloader: remove maxStateNodeRetries
It fails the sync too much.
* eth/downloader: remove last use of cancelCh in statesync.go
Fixes TestDeliverHeadersHang*Fast and (hopefully)
the weird cancellation behaviour at the end of fast sync.
* eth/downloader: fix leak in runStateSync
* eth/downloader: don't run processFullSyncContent in LightSync mode
* eth/downloader: improve comments
* eth/downloader: fix vet, megacheck
* eth/downloader: remove unrequested tasks anyway
* eth/downloader, trie: various polishes around duplicate items
This commit explicitly tracks duplicate and unexpected state
delieveries done against a trie Sync structure, also adding
there to import info logs.
The commit moves the db batch used to commit trie changes one
level deeper so its flushed after every node insertion. This
is needed to avoid a lot of duplicate retrievals caused by
inconsistencies between Sync internals and database. A better
approach is to track not-yet-written states in trie.Sync and
flush on commit, but I'm focuing on correctness first now.
The commit fixes a regression around pivot block fail count.
The counter previously was reset to 1 if and only if a sync
cycle progressed (inserted at least 1 entry to the database).
The current code reset it already if a node was delivered,
which is not stong enough, because unless it ends up written
to disk, an attacker can just loop and attack ad infinitum.
The commit also fixes a regression around state deliveries
and timeouts. The old downloader tracked if a delivery is
stale (none of the deliveries were requestedt), in which
case it didn't mark the node idle and did not send further
requests, since it signals a past timeout. The current code
did mark it idle even on stale deliveries, which eventually
caused two requests to be in flight at the same time, making
the deliveries always stale and mass duplicating retrievals
between multiple peers.
* eth/downloader: fix state request leak
This commit fixes the hang seen sometimes while doing the state
sync. The cause of the hang was a rare combination of events:
request state data from peer, peer drops and reconnects almost
immediately. This caused a new download task to be assigned to
the peer, overwriting the old one still waiting for a timeout,
which in turned leaked the requests out, never to be retried.
The fix is to ensure that a task assignment moves any pending
one back into the retry queue.
The commit also fixes a regression with peer dropping due to
stalls. The current code considered a peer stalling if they
timed out delivering 1 item. However, the downloader never
requests only one, the minimum is 2 (attempt to fine tune
estimated latency/bandwidth). The fix is simply to drop if
a timeout is detected at 2 items.
Apart from the above bugfixes, the commit contains some code
polishes I made while debugging the hang.
* core, eth, trie: support batched trie sync db writes
* trie: rename SyncMemCache to syncMemBatch
2017-06-22 15:26:03 +03:00
|
|
|
fetchers := []func() error{
|
2021-11-26 14:26:03 +03:00
|
|
|
func() error { return d.fetchHeaders(p, origin+1, latest.Number.Uint64()) }, // Headers are always retrieved
|
|
|
|
func() error { return d.fetchBodies(origin + 1) }, // Bodies are retrieved during normal and snap sync
|
|
|
|
func() error { return d.fetchReceipts(origin + 1) }, // Receipts are retrieved during snap sync
|
2020-09-08 11:13:16 +03:00
|
|
|
func() error { return d.processHeaders(origin+1, td) },
|
eth/downloader: separate state sync from queue (#14460)
* eth/downloader: separate state sync from queue
Scheduling of state node downloads hogged the downloader queue lock when
new requests were scheduled. This caused timeouts for other requests.
With this change, state sync is fully independent of all other downloads
and doesn't involve the queue at all.
State sync is started and checked on in processContent. This is slightly
awkward because processContent doesn't have a select loop. Instead, the
queue is closed by an auxiliary goroutine when state sync fails. We
tried several alternatives to this but settled on the current approach
because it's the least amount of change overall.
Handling of the pivot block has changed slightly: the queue previously
prevented import of pivot block receipts before the state of the pivot
block was available. In this commit, the receipt will be imported before
the state. This causes an annoyance where the pivot block is committed
as fast block head even when state downloads fail. Stay tuned for more
updates in this area ;)
* eth/downloader: remove cancelTimeout channel
* eth/downloader: retry state requests on timeout
* eth/downloader: improve comment
* eth/downloader: mark peers idle when state sync is done
* eth/downloader: move pivot block splitting to processContent
This change also ensures that pivot block receipts aren't imported
before the pivot block itself.
* eth/downloader: limit state node retries
* eth/downloader: improve state node error handling and retry check
* eth/downloader: remove maxStateNodeRetries
It fails the sync too much.
* eth/downloader: remove last use of cancelCh in statesync.go
Fixes TestDeliverHeadersHang*Fast and (hopefully)
the weird cancellation behaviour at the end of fast sync.
* eth/downloader: fix leak in runStateSync
* eth/downloader: don't run processFullSyncContent in LightSync mode
* eth/downloader: improve comments
* eth/downloader: fix vet, megacheck
* eth/downloader: remove unrequested tasks anyway
* eth/downloader, trie: various polishes around duplicate items
This commit explicitly tracks duplicate and unexpected state
delieveries done against a trie Sync structure, also adding
there to import info logs.
The commit moves the db batch used to commit trie changes one
level deeper so its flushed after every node insertion. This
is needed to avoid a lot of duplicate retrievals caused by
inconsistencies between Sync internals and database. A better
approach is to track not-yet-written states in trie.Sync and
flush on commit, but I'm focuing on correctness first now.
The commit fixes a regression around pivot block fail count.
The counter previously was reset to 1 if and only if a sync
cycle progressed (inserted at least 1 entry to the database).
The current code reset it already if a node was delivered,
which is not stong enough, because unless it ends up written
to disk, an attacker can just loop and attack ad infinitum.
The commit also fixes a regression around state deliveries
and timeouts. The old downloader tracked if a delivery is
stale (none of the deliveries were requestedt), in which
case it didn't mark the node idle and did not send further
requests, since it signals a past timeout. The current code
did mark it idle even on stale deliveries, which eventually
caused two requests to be in flight at the same time, making
the deliveries always stale and mass duplicating retrievals
between multiple peers.
* eth/downloader: fix state request leak
This commit fixes the hang seen sometimes while doing the state
sync. The cause of the hang was a rare combination of events:
request state data from peer, peer drops and reconnects almost
immediately. This caused a new download task to be assigned to
the peer, overwriting the old one still waiting for a timeout,
which in turned leaked the requests out, never to be retried.
The fix is to ensure that a task assignment moves any pending
one back into the retry queue.
The commit also fixes a regression with peer dropping due to
stalls. The current code considered a peer stalling if they
timed out delivering 1 item. However, the downloader never
requests only one, the minimum is 2 (attempt to fine tune
estimated latency/bandwidth). The fix is simply to drop if
a timeout is detected at 2 items.
Apart from the above bugfixes, the commit contains some code
polishes I made while debugging the hang.
* core, eth, trie: support batched trie sync db writes
* trie: rename SyncMemCache to syncMemBatch
2017-06-22 15:26:03 +03:00
|
|
|
}
|
2021-11-26 14:26:03 +03:00
|
|
|
if mode == SnapSync {
|
2020-09-08 11:13:16 +03:00
|
|
|
d.pivotLock.Lock()
|
|
|
|
d.pivotHeader = pivot
|
|
|
|
d.pivotLock.Unlock()
|
|
|
|
|
2021-11-26 14:26:03 +03:00
|
|
|
fetchers = append(fetchers, func() error { return d.processSnapSyncContent() })
|
2020-06-30 20:43:29 +03:00
|
|
|
} else if mode == FullSync {
|
eth/downloader: separate state sync from queue (#14460)
* eth/downloader: separate state sync from queue
Scheduling of state node downloads hogged the downloader queue lock when
new requests were scheduled. This caused timeouts for other requests.
With this change, state sync is fully independent of all other downloads
and doesn't involve the queue at all.
State sync is started and checked on in processContent. This is slightly
awkward because processContent doesn't have a select loop. Instead, the
queue is closed by an auxiliary goroutine when state sync fails. We
tried several alternatives to this but settled on the current approach
because it's the least amount of change overall.
Handling of the pivot block has changed slightly: the queue previously
prevented import of pivot block receipts before the state of the pivot
block was available. In this commit, the receipt will be imported before
the state. This causes an annoyance where the pivot block is committed
as fast block head even when state downloads fail. Stay tuned for more
updates in this area ;)
* eth/downloader: remove cancelTimeout channel
* eth/downloader: retry state requests on timeout
* eth/downloader: improve comment
* eth/downloader: mark peers idle when state sync is done
* eth/downloader: move pivot block splitting to processContent
This change also ensures that pivot block receipts aren't imported
before the pivot block itself.
* eth/downloader: limit state node retries
* eth/downloader: improve state node error handling and retry check
* eth/downloader: remove maxStateNodeRetries
It fails the sync too much.
* eth/downloader: remove last use of cancelCh in statesync.go
Fixes TestDeliverHeadersHang*Fast and (hopefully)
the weird cancellation behaviour at the end of fast sync.
* eth/downloader: fix leak in runStateSync
* eth/downloader: don't run processFullSyncContent in LightSync mode
* eth/downloader: improve comments
* eth/downloader: fix vet, megacheck
* eth/downloader: remove unrequested tasks anyway
* eth/downloader, trie: various polishes around duplicate items
This commit explicitly tracks duplicate and unexpected state
deliveries done against a trie Sync structure, also adding
there to import info logs.
The commit moves the db batch used to commit trie changes one
level deeper so it's flushed after every node insertion. This
is needed to avoid a lot of duplicate retrievals caused by
inconsistencies between Sync internals and database. A better
approach is to track not-yet-written states in trie.Sync and
flush on commit, but I'm focusing on correctness first now.
The commit fixes a regression around pivot block fail count.
The counter previously was reset to 1 if and only if a sync
cycle progressed (inserted at least 1 entry to the database).
The current code reset it already if a node was delivered,
which is not strong enough, because unless it ends up written
to disk, an attacker can just loop and attack ad infinitum.
The commit also fixes a regression around state deliveries
and timeouts. The old downloader tracked if a delivery is
stale (none of the deliveries were requested), in which
case it didn't mark the node idle and did not send further
requests, since it signals a past timeout. The current code
did mark it idle even on stale deliveries, which eventually
caused two requests to be in flight at the same time, making
the deliveries always stale and mass duplicating retrievals
between multiple peers.
* eth/downloader: fix state request leak
This commit fixes the hang seen sometimes while doing the state
sync. The cause of the hang was a rare combination of events:
request state data from peer, peer drops and reconnects almost
immediately. This caused a new download task to be assigned to
the peer, overwriting the old one still waiting for a timeout,
which in turn leaked the requests out, never to be retried.
The fix is to ensure that a task assignment moves any pending
one back into the retry queue.
The commit also fixes a regression with peer dropping due to
stalls. The current code considered a peer stalling if they
timed out delivering 1 item. However, the downloader never
requests only one, the minimum is 2 (attempt to fine tune
estimated latency/bandwidth). The fix is simply to drop if
a timeout is detected at 2 items.
Apart from the above bugfixes, the commit contains some code
polishes I made while debugging the hang.
* core, eth, trie: support batched trie sync db writes
* trie: rename SyncMemCache to syncMemBatch
2017-06-22 15:26:03 +03:00
|
|
|
fetchers = append(fetchers, d.processFullSyncContent)
|
|
|
|
}
|
2018-02-05 19:40:32 +03:00
|
|
|
return d.spawnSync(fetchers)
|
2015-11-13 18:08:15 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// spawnSync runs d.process and all given fetcher functions to completion in
|
|
|
|
// separate goroutines, returning the first error that appears.
|
eth/downloader: separate state sync from queue (#14460)
* eth/downloader: separate state sync from queue
Scheduling of state node downloads hogged the downloader queue lock when
new requests were scheduled. This caused timeouts for other requests.
With this change, state sync is fully independent of all other downloads
and doesn't involve the queue at all.
State sync is started and checked on in processContent. This is slightly
awkward because processContent doesn't have a select loop. Instead, the
queue is closed by an auxiliary goroutine when state sync fails. We
tried several alternatives to this but settled on the current approach
because it's the least amount of change overall.
Handling of the pivot block has changed slightly: the queue previously
prevented import of pivot block receipts before the state of the pivot
block was available. In this commit, the receipt will be imported before
the state. This causes an annoyance where the pivot block is committed
as fast block head even when state downloads fail. Stay tuned for more
updates in this area ;)
* eth/downloader: remove cancelTimeout channel
* eth/downloader: retry state requests on timeout
* eth/downloader: improve comment
* eth/downloader: mark peers idle when state sync is done
* eth/downloader: move pivot block splitting to processContent
This change also ensures that pivot block receipts aren't imported
before the pivot block itself.
* eth/downloader: limit state node retries
* eth/downloader: improve state node error handling and retry check
* eth/downloader: remove maxStateNodeRetries
It fails the sync too much.
* eth/downloader: remove last use of cancelCh in statesync.go
Fixes TestDeliverHeadersHang*Fast and (hopefully)
the weird cancellation behaviour at the end of fast sync.
* eth/downloader: fix leak in runStateSync
* eth/downloader: don't run processFullSyncContent in LightSync mode
* eth/downloader: improve comments
* eth/downloader: fix vet, megacheck
* eth/downloader: remove unrequested tasks anyway
* eth/downloader, trie: various polishes around duplicate items
This commit explicitly tracks duplicate and unexpected state
delieveries done against a trie Sync structure, also adding
there to import info logs.
The commit moves the db batch used to commit trie changes one
level deeper so its flushed after every node insertion. This
is needed to avoid a lot of duplicate retrievals caused by
inconsistencies between Sync internals and database. A better
approach is to track not-yet-written states in trie.Sync and
flush on commit, but I'm focuing on correctness first now.
The commit fixes a regression around pivot block fail count.
The counter previously was reset to 1 if and only if a sync
cycle progressed (inserted at least 1 entry to the database).
The current code reset it already if a node was delivered,
which is not stong enough, because unless it ends up written
to disk, an attacker can just loop and attack ad infinitum.
The commit also fixes a regression around state deliveries
and timeouts. The old downloader tracked if a delivery is
stale (none of the deliveries were requestedt), in which
case it didn't mark the node idle and did not send further
requests, since it signals a past timeout. The current code
did mark it idle even on stale deliveries, which eventually
caused two requests to be in flight at the same time, making
the deliveries always stale and mass duplicating retrievals
between multiple peers.
* eth/downloader: fix state request leak
This commit fixes the hang seen sometimes while doing the state
sync. The cause of the hang was a rare combination of events:
request state data from peer, peer drops and reconnects almost
immediately. This caused a new download task to be assigned to
the peer, overwriting the old one still waiting for a timeout,
which in turned leaked the requests out, never to be retried.
The fix is to ensure that a task assignment moves any pending
one back into the retry queue.
The commit also fixes a regression with peer dropping due to
stalls. The current code considered a peer stalling if they
timed out delivering 1 item. However, the downloader never
requests only one, the minimum is 2 (attempt to fine tune
estimated latency/bandwidth). The fix is simply to drop if
a timeout is detected at 2 items.
Apart from the above bugfixes, the commit contains some code
polishes I made while debugging the hang.
* core, eth, trie: support batched trie sync db writes
* trie: rename SyncMemCache to syncMemBatch
2017-06-22 15:26:03 +03:00
|
|
|
func (d *Downloader) spawnSync(fetchers []func() error) error {
|
|
|
|
errc := make(chan error, len(fetchers))
|
2018-04-16 11:37:48 +03:00
|
|
|
d.cancelWg.Add(len(fetchers))
|
2015-11-13 18:08:15 +02:00
|
|
|
for _, fn := range fetchers {
|
|
|
|
fn := fn
|
2018-04-16 11:37:48 +03:00
|
|
|
go func() { defer d.cancelWg.Done(); errc <- fn() }()
|
2015-11-13 18:08:15 +02:00
|
|
|
}
|
|
|
|
// Wait for the first error, then terminate the others.
|
|
|
|
var err error
|
eth/downloader: separate state sync from queue (#14460)
* eth/downloader: separate state sync from queue
Scheduling of state node downloads hogged the downloader queue lock when
new requests were scheduled. This caused timeouts for other requests.
With this change, state sync is fully independent of all other downloads
and doesn't involve the queue at all.
State sync is started and checked on in processContent. This is slightly
awkward because processContent doesn't have a select loop. Instead, the
queue is closed by an auxiliary goroutine when state sync fails. We
tried several alternatives to this but settled on the current approach
because it's the least amount of change overall.
Handling of the pivot block has changed slightly: the queue previously
prevented import of pivot block receipts before the state of the pivot
block was available. In this commit, the receipt will be imported before
the state. This causes an annoyance where the pivot block is committed
as fast block head even when state downloads fail. Stay tuned for more
updates in this area ;)
* eth/downloader: remove cancelTimeout channel
* eth/downloader: retry state requests on timeout
* eth/downloader: improve comment
* eth/downloader: mark peers idle when state sync is done
* eth/downloader: move pivot block splitting to processContent
This change also ensures that pivot block receipts aren't imported
before the pivot block itself.
* eth/downloader: limit state node retries
* eth/downloader: improve state node error handling and retry check
* eth/downloader: remove maxStateNodeRetries
It fails the sync too much.
* eth/downloader: remove last use of cancelCh in statesync.go
Fixes TestDeliverHeadersHang*Fast and (hopefully)
the weird cancellation behaviour at the end of fast sync.
* eth/downloader: fix leak in runStateSync
* eth/downloader: don't run processFullSyncContent in LightSync mode
* eth/downloader: improve comments
* eth/downloader: fix vet, megacheck
* eth/downloader: remove unrequested tasks anyway
* eth/downloader, trie: various polishes around duplicate items
This commit explicitly tracks duplicate and unexpected state
delieveries done against a trie Sync structure, also adding
there to import info logs.
The commit moves the db batch used to commit trie changes one
level deeper so its flushed after every node insertion. This
is needed to avoid a lot of duplicate retrievals caused by
inconsistencies between Sync internals and database. A better
approach is to track not-yet-written states in trie.Sync and
flush on commit, but I'm focuing on correctness first now.
The commit fixes a regression around pivot block fail count.
The counter previously was reset to 1 if and only if a sync
cycle progressed (inserted at least 1 entry to the database).
The current code reset it already if a node was delivered,
which is not stong enough, because unless it ends up written
to disk, an attacker can just loop and attack ad infinitum.
The commit also fixes a regression around state deliveries
and timeouts. The old downloader tracked if a delivery is
stale (none of the deliveries were requestedt), in which
case it didn't mark the node idle and did not send further
requests, since it signals a past timeout. The current code
did mark it idle even on stale deliveries, which eventually
caused two requests to be in flight at the same time, making
the deliveries always stale and mass duplicating retrievals
between multiple peers.
* eth/downloader: fix state request leak
This commit fixes the hang seen sometimes while doing the state
sync. The cause of the hang was a rare combination of events:
request state data from peer, peer drops and reconnects almost
immediately. This caused a new download task to be assigned to
the peer, overwriting the old one still waiting for a timeout,
which in turned leaked the requests out, never to be retried.
The fix is to ensure that a task assignment moves any pending
one back into the retry queue.
The commit also fixes a regression with peer dropping due to
stalls. The current code considered a peer stalling if they
timed out delivering 1 item. However, the downloader never
requests only one, the minimum is 2 (attempt to fine tune
estimated latency/bandwidth). The fix is simply to drop if
a timeout is detected at 2 items.
Apart from the above bugfixes, the commit contains some code
polishes I made while debugging the hang.
* core, eth, trie: support batched trie sync db writes
* trie: rename SyncMemCache to syncMemBatch
2017-06-22 15:26:03 +03:00
|
|
|
for i := 0; i < len(fetchers); i++ {
|
|
|
|
if i == len(fetchers)-1 {
|
2015-11-13 18:08:15 +02:00
|
|
|
// Close the queue when all fetchers have exited.
|
|
|
|
// This will cause the block processor to end when
|
|
|
|
// it has processed the queue.
|
|
|
|
d.queue.Close()
|
|
|
|
}
|
2019-06-05 15:00:46 +03:00
|
|
|
if err = <-errc; err != nil && err != errCanceled {
|
2015-11-13 18:08:15 +02:00
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
d.queue.Close()
|
2017-03-22 03:37:24 +03:00
|
|
|
d.Cancel()
|
2015-11-13 18:08:15 +02:00
|
|
|
return err
|
2015-04-12 13:38:25 +03:00
|
|
|
}
|
|
|
|
|
2018-04-23 10:01:21 +03:00
|
|
|
// cancel aborts all of the operations and resets the queue. However, cancel does
|
|
|
|
// not wait for the running download goroutines to finish. This method should be
|
|
|
|
// used when cancelling the downloads from inside the downloader.
|
|
|
|
func (d *Downloader) cancel() {
|
2015-05-13 14:01:08 +03:00
|
|
|
// Close the current cancel channel
|
2015-05-15 19:43:42 +03:00
|
|
|
d.cancelLock.Lock()
|
2020-04-27 11:22:15 +03:00
|
|
|
defer d.cancelLock.Unlock()
|
|
|
|
|
2015-06-12 13:35:29 +03:00
|
|
|
if d.cancelCh != nil {
|
|
|
|
select {
|
|
|
|
case <-d.cancelCh:
|
|
|
|
// Channel was already closed
|
|
|
|
default:
|
|
|
|
close(d.cancelCh)
|
|
|
|
}
|
2015-05-15 19:43:42 +03:00
|
|
|
}
|
2018-04-23 10:01:21 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
// Cancel aborts all of the operations and waits for all download goroutines to
|
|
|
|
// finish before returning.
|
|
|
|
func (d *Downloader) Cancel() {
|
|
|
|
d.cancel()
|
2018-04-16 11:37:48 +03:00
|
|
|
d.cancelWg.Wait()
|
2015-05-10 01:34:07 +03:00
|
|
|
}
|
|
|
|
|
2015-06-18 00:04:57 +03:00
|
|
|
// Terminate interrupts the downloader, canceling all pending operations.
|
2015-11-13 18:08:15 +02:00
|
|
|
// The downloader cannot be reused after calling Terminate.
|
2015-06-18 00:04:57 +03:00
|
|
|
func (d *Downloader) Terminate() {
|
2016-06-01 18:07:25 +03:00
|
|
|
// Close the termination channel (make sure double close is allowed)
|
|
|
|
d.quitLock.Lock()
|
|
|
|
select {
|
|
|
|
case <-d.quitCh:
|
|
|
|
default:
|
|
|
|
close(d.quitCh)
|
|
|
|
}
|
2020-07-24 10:46:26 +03:00
|
|
|
if d.stateBloom != nil {
|
|
|
|
d.stateBloom.Close()
|
|
|
|
}
|
2016-06-01 18:07:25 +03:00
|
|
|
d.quitLock.Unlock()
|
|
|
|
|
|
|
|
// Cancel any pending download requests
|
2017-03-22 03:37:24 +03:00
|
|
|
d.Cancel()
|
2015-06-18 00:04:57 +03:00
|
|
|
}
|
|
|
|
|
2020-09-08 11:13:16 +03:00
|
|
|
// fetchHead retrieves the head header and prior pivot block (if available) from
|
|
|
|
// a remote peer.
|
|
|
|
func (d *Downloader) fetchHead(p *peerConnection) (head *types.Header, pivot *types.Header, err error) {
|
|
|
|
p.log.Debug("Retrieving remote chain head")
|
|
|
|
mode := d.getMode()
|
2015-09-09 19:02:54 +03:00
|
|
|
|
|
|
|
// Request the advertised remote head block and wait for the response
|
2020-09-08 11:13:16 +03:00
|
|
|
latest, _ := p.peer.Head()
|
|
|
|
fetch := 1
|
2021-11-26 14:26:03 +03:00
|
|
|
if mode == SnapSync {
|
2020-09-08 11:13:16 +03:00
|
|
|
fetch = 2 // head + pivot headers
|
|
|
|
}
|
2021-11-26 14:26:03 +03:00
|
|
|
headers, err := d.fetchHeadersByHash(p, latest, fetch, fsMinFullBlocks-1, true)
|
|
|
|
if err != nil {
|
|
|
|
return nil, nil, err
|
|
|
|
}
|
|
|
|
// Make sure the peer gave us at least one and at most the requested headers
|
|
|
|
if len(headers) == 0 || len(headers) > fetch {
|
|
|
|
return nil, nil, fmt.Errorf("%w: returned headers %d != requested %d", errBadPeer, len(headers), fetch)
|
|
|
|
}
|
|
|
|
// The first header needs to be the head, validate against the checkpoint
|
|
|
|
// and request. If only 1 header was returned, make sure there's no pivot
|
|
|
|
// or there was not one requested.
|
|
|
|
head = headers[0]
|
|
|
|
if (mode == SnapSync || mode == LightSync) && head.Number.Uint64() < d.checkpoint {
|
|
|
|
return nil, nil, fmt.Errorf("%w: remote head %d below checkpoint %d", errUnsyncedPeer, head.Number, d.checkpoint)
|
|
|
|
}
|
|
|
|
if len(headers) == 1 {
|
|
|
|
if mode == SnapSync && head.Number.Uint64() > uint64(fsMinFullBlocks) {
|
|
|
|
return nil, nil, fmt.Errorf("%w: no pivot included along head header", errBadPeer)
|
2015-09-09 19:02:54 +03:00
|
|
|
}
|
2021-11-26 14:26:03 +03:00
|
|
|
p.log.Debug("Remote head identified, no pivot", "number", head.Number, "hash", head.Hash())
|
|
|
|
return head, nil, nil
|
|
|
|
}
|
|
|
|
// At this point we have 2 headers in total and the first is the
|
|
|
|
// validated head of the chain. Check the pivot number and return,
|
|
|
|
pivot = headers[1]
|
|
|
|
if pivot.Number.Uint64() != head.Number.Uint64()-uint64(fsMinFullBlocks) {
|
|
|
|
return nil, nil, fmt.Errorf("%w: remote pivot %d != requested %d", errInvalidChain, pivot.Number, head.Number.Uint64()-uint64(fsMinFullBlocks))
|
2015-09-09 19:02:54 +03:00
|
|
|
}
|
2021-11-26 14:26:03 +03:00
|
|
|
return head, pivot, nil
|
2015-09-09 19:02:54 +03:00
|
|
|
}
|
|
|
|
|
2018-11-12 16:18:56 +03:00
|
|
|
// calculateRequestSpan calculates what headers to request from a peer when trying to determine the
|
|
|
|
// common ancestor.
|
|
|
|
// It returns parameters to be used for peer.RequestHeadersByNumber:
|
|
|
|
// from - starting block number
|
|
|
|
// count - number of headers to request
|
|
|
|
// skip - number of headers to skip
|
|
|
|
// and also returns 'max', the last block which is expected to be returned by the remote peers,
|
|
|
|
// given the (from,count,skip)
|
|
|
|
func calculateRequestSpan(remoteHeight, localHeight uint64) (int64, int, int, uint64) {
|
|
|
|
var (
|
|
|
|
from int
|
|
|
|
count int
|
|
|
|
MaxCount = MaxHeaderFetch / 16
|
|
|
|
)
|
|
|
|
// requestHead is the highest block that we will ask for. If requestHead is not offset,
|
|
|
|
// the highest block that we will get is 16 blocks back from head, which means we
|
|
|
|
// will fetch 14 or 15 blocks unnecessarily in the case the height difference
|
|
|
|
// between us and the peer is 1-2 blocks, which is most common
|
|
|
|
requestHead := int(remoteHeight) - 1
|
|
|
|
if requestHead < 0 {
|
|
|
|
requestHead = 0
|
|
|
|
}
|
|
|
|
// requestBottom is the lowest block we want included in the query
|
2020-05-06 16:35:04 +03:00
|
|
|
// Ideally, we want to include the one just below our own head
|
2018-11-12 16:18:56 +03:00
|
|
|
requestBottom := int(localHeight - 1)
|
|
|
|
if requestBottom < 0 {
|
|
|
|
requestBottom = 0
|
|
|
|
}
|
|
|
|
totalSpan := requestHead - requestBottom
|
|
|
|
span := 1 + totalSpan/MaxCount
|
|
|
|
if span < 2 {
|
|
|
|
span = 2
|
|
|
|
}
|
|
|
|
if span > 16 {
|
|
|
|
span = 16
|
|
|
|
}
|
|
|
|
|
|
|
|
count = 1 + totalSpan/span
|
|
|
|
if count > MaxCount {
|
|
|
|
count = MaxCount
|
|
|
|
}
|
|
|
|
if count < 2 {
|
|
|
|
count = 2
|
|
|
|
}
|
|
|
|
from = requestHead - (count-1)*span
|
|
|
|
if from < 0 {
|
|
|
|
from = 0
|
|
|
|
}
|
|
|
|
max := from + (count-1)*span
|
|
|
|
return int64(from), count, span - 1, uint64(max)
|
|
|
|
}
|
|
|
|
|
2015-09-28 19:27:31 +03:00
|
|
|
// findAncestor tries to locate the common ancestor link of the local chain and
|
2015-08-14 21:25:41 +03:00
|
|
|
// a remote peers blockchain. In the general case when our node was in sync and
|
2015-09-28 19:27:31 +03:00
|
|
|
// on the correct chain, checking the top N links should already get us a match.
|
2016-03-15 20:55:39 +02:00
|
|
|
// In the rare scenario when we ended up on a long reorganisation (i.e. none of
|
2015-09-28 19:27:31 +03:00
|
|
|
// the head links match), we do a binary search to find the common ancestor.
|
2018-11-12 16:18:56 +03:00
|
|
|
func (d *Downloader) findAncestor(p *peerConnection, remoteHeader *types.Header) (uint64, error) {
|
2016-05-13 13:12:13 +03:00
|
|
|
// Figure out the valid ancestor range to prevent rewrite attacks
|
2018-11-12 16:18:56 +03:00
|
|
|
var (
|
|
|
|
floor = int64(-1)
|
|
|
|
localHeight uint64
|
|
|
|
remoteHeight = remoteHeader.Number.Uint64()
|
|
|
|
)
|
2020-06-30 20:43:29 +03:00
|
|
|
mode := d.getMode()
|
|
|
|
switch mode {
|
2018-11-12 16:18:56 +03:00
|
|
|
case FullSync:
|
|
|
|
localHeight = d.blockchain.CurrentBlock().NumberU64()
|
2021-11-26 14:26:03 +03:00
|
|
|
case SnapSync:
|
2018-11-12 16:18:56 +03:00
|
|
|
localHeight = d.blockchain.CurrentFastBlock().NumberU64()
|
|
|
|
default:
|
|
|
|
localHeight = d.lightchain.CurrentHeader().Number.Uint64()
|
2016-05-13 13:12:13 +03:00
|
|
|
}
|
2018-11-12 16:18:56 +03:00
|
|
|
p.log.Debug("Looking for common ancestor", "local", localHeight, "remote", remoteHeight)
|
2019-03-14 13:19:03 +03:00
|
|
|
|
|
|
|
// Recap floor value for binary search
|
2020-07-13 12:02:54 +03:00
|
|
|
maxForkAncestry := fullMaxForkAncestry
|
|
|
|
if d.getMode() == LightSync {
|
|
|
|
maxForkAncestry = lightMaxForkAncestry
|
|
|
|
}
|
2019-05-15 14:33:33 +03:00
|
|
|
if localHeight >= maxForkAncestry {
|
2018-11-28 14:31:42 +03:00
|
|
|
// We're above the max reorg threshold, find the earliest fork point
|
2019-05-15 14:33:33 +03:00
|
|
|
floor = int64(localHeight - maxForkAncestry)
|
2019-03-14 13:19:03 +03:00
|
|
|
}
|
|
|
|
// If we're doing a light sync, ensure the floor doesn't go below the CHT, as
|
|
|
|
// all headers before that point will be missing.
|
2020-06-30 20:43:29 +03:00
|
|
|
if mode == LightSync {
|
2020-05-25 11:21:28 +03:00
|
|
|
// If we don't know the current CHT position, find it
|
2019-03-14 13:19:03 +03:00
|
|
|
if d.genesis == 0 {
|
|
|
|
header := d.lightchain.CurrentHeader()
|
|
|
|
for header != nil {
|
|
|
|
d.genesis = header.Number.Uint64()
|
|
|
|
if floor >= int64(d.genesis)-1 {
|
|
|
|
break
|
2018-11-28 14:31:42 +03:00
|
|
|
}
|
2019-03-14 13:19:03 +03:00
|
|
|
header = d.lightchain.GetHeaderByHash(header.ParentHash)
|
2018-11-28 14:31:42 +03:00
|
|
|
}
|
2019-03-14 13:19:03 +03:00
|
|
|
}
|
|
|
|
// We already know the "genesis" block number, cap floor to that
|
|
|
|
if floor < int64(d.genesis)-1 {
|
|
|
|
floor = int64(d.genesis) - 1
|
2018-11-28 14:31:42 +03:00
|
|
|
}
|
2016-05-13 13:12:13 +03:00
|
|
|
}
|
2019-03-14 13:19:03 +03:00
|
|
|
|
2021-01-21 00:45:01 +03:00
|
|
|
ancestor, err := d.findAncestorSpanSearch(p, mode, remoteHeight, localHeight, floor)
|
|
|
|
if err == nil {
|
|
|
|
return ancestor, nil
|
|
|
|
}
|
|
|
|
// The returned error was not nil.
|
|
|
|
// If the error returned does not reflect that a common ancestor was not found, return it.
|
|
|
|
// If the error reflects that a common ancestor was not found, continue to binary search,
|
|
|
|
// where the error value will be reassigned.
|
|
|
|
if !errors.Is(err, errNoAncestorFound) {
|
|
|
|
return 0, err
|
|
|
|
}
|
|
|
|
|
|
|
|
ancestor, err = d.findAncestorBinarySearch(p, mode, remoteHeight, floor)
|
|
|
|
if err != nil {
|
|
|
|
return 0, err
|
|
|
|
}
|
|
|
|
return ancestor, nil
|
|
|
|
}
|
|
|
|
|
2021-11-26 14:26:03 +03:00
|
|
|
func (d *Downloader) findAncestorSpanSearch(p *peerConnection, mode SyncMode, remoteHeight, localHeight uint64, floor int64) (uint64, error) {
|
2018-11-12 16:18:56 +03:00
|
|
|
from, count, skip, max := calculateRequestSpan(remoteHeight, localHeight)
|
2018-11-16 14:15:05 +03:00
|
|
|
|
|
|
|
p.log.Trace("Span searching for common ancestor", "count", count, "from", from, "skip", skip)
|
2021-11-26 14:26:03 +03:00
|
|
|
headers, err := d.fetchHeadersByNumber(p, uint64(from), count, skip, false)
|
|
|
|
if err != nil {
|
|
|
|
return 0, err
|
|
|
|
}
|
2015-08-14 21:25:41 +03:00
|
|
|
// Wait for the remote response to the head fetch
|
|
|
|
number, hash := uint64(0), common.Hash{}
|
2017-02-24 19:23:03 +03:00
|
|
|
|
2021-11-26 14:26:03 +03:00
|
|
|
// Make sure the peer actually gave something valid
|
|
|
|
if len(headers) == 0 {
|
|
|
|
p.log.Warn("Empty head header set")
|
|
|
|
return 0, errEmptyHeaderSet
|
|
|
|
}
|
|
|
|
// Make sure the peer's reply conforms to the request
|
|
|
|
for i, header := range headers {
|
|
|
|
expectNumber := from + int64(i)*int64(skip+1)
|
|
|
|
if number := header.Number.Int64(); number != expectNumber {
|
|
|
|
p.log.Warn("Head headers broke chain ordering", "index", i, "requested", expectNumber, "received", number)
|
|
|
|
return 0, fmt.Errorf("%w: %v", errInvalidChain, errors.New("head headers broke chain ordering"))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// Check if a common ancestor was found
|
|
|
|
for i := len(headers) - 1; i >= 0; i-- {
|
|
|
|
// Skip any headers that underflow/overflow our requested set
|
|
|
|
if headers[i].Number.Int64() < from || headers[i].Number.Uint64() > max {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
// Otherwise check if we already know the header or not
|
|
|
|
h := headers[i].Hash()
|
|
|
|
n := headers[i].Number.Uint64()
|
|
|
|
|
|
|
|
var known bool
|
|
|
|
switch mode {
|
|
|
|
case FullSync:
|
|
|
|
known = d.blockchain.HasBlock(h, n)
|
|
|
|
case SnapSync:
|
|
|
|
known = d.blockchain.HasFastBlock(h, n)
|
|
|
|
default:
|
|
|
|
known = d.lightchain.HasHeader(h, n)
|
|
|
|
}
|
|
|
|
if known {
|
|
|
|
number, hash = n, h
|
|
|
|
break
|
2015-08-14 21:25:41 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
// If the head fetch already found an ancestor, return
|
2018-05-29 13:42:21 +03:00
|
|
|
if hash != (common.Hash{}) {
|
2016-05-13 13:12:13 +03:00
|
|
|
if int64(number) <= floor {
|
2017-03-02 16:06:16 +03:00
|
|
|
p.log.Warn("Ancestor below allowance", "number", number, "hash", hash, "allowance", floor)
|
2016-05-13 13:12:13 +03:00
|
|
|
return 0, errInvalidAncestor
|
|
|
|
}
|
2017-03-02 16:06:16 +03:00
|
|
|
p.log.Debug("Found common ancestor", "number", number, "hash", hash)
|
2015-08-14 21:25:41 +03:00
|
|
|
return number, nil
|
|
|
|
}
|
2021-01-21 00:45:01 +03:00
|
|
|
return 0, errNoAncestorFound
|
|
|
|
}
|
|
|
|
|
2021-11-26 14:26:03 +03:00
|
|
|
func (d *Downloader) findAncestorBinarySearch(p *peerConnection, mode SyncMode, remoteHeight uint64, floor int64) (uint64, error) {
|
2021-01-21 00:45:01 +03:00
|
|
|
hash := common.Hash{}
|
|
|
|
|
2015-08-14 21:25:41 +03:00
|
|
|
// Ancestor not found, we need to binary search over our chain
|
2018-11-12 16:18:56 +03:00
|
|
|
start, end := uint64(0), remoteHeight
|
2016-05-13 13:12:13 +03:00
|
|
|
if floor > 0 {
|
|
|
|
start = uint64(floor)
|
|
|
|
}
|
2018-11-16 14:15:05 +03:00
|
|
|
p.log.Trace("Binary searching for common ancestor", "start", start, "end", end)
|
|
|
|
|
2015-08-14 21:25:41 +03:00
|
|
|
for start+1 < end {
|
|
|
|
// Split our chain interval in two, and request the hash to cross check
|
|
|
|
check := (start + end) / 2
|
|
|
|
|
2021-11-26 14:26:03 +03:00
|
|
|
headers, err := d.fetchHeadersByNumber(p, check, 1, 0, false)
|
|
|
|
if err != nil {
|
|
|
|
return 0, err
|
|
|
|
}
|
|
|
|
// Make sure the peer actually gave something valid
|
|
|
|
if len(headers) != 1 {
|
|
|
|
p.log.Warn("Multiple headers for single request", "headers", len(headers))
|
|
|
|
return 0, fmt.Errorf("%w: multiple headers (%d) for single request", errBadPeer, len(headers))
|
|
|
|
}
|
|
|
|
// Modify the search interval based on the response
|
|
|
|
h := headers[0].Hash()
|
|
|
|
n := headers[0].Number.Uint64()
|
|
|
|
|
|
|
|
var known bool
|
|
|
|
switch mode {
|
|
|
|
case FullSync:
|
|
|
|
known = d.blockchain.HasBlock(h, n)
|
|
|
|
case SnapSync:
|
|
|
|
known = d.blockchain.HasFastBlock(h, n)
|
|
|
|
default:
|
|
|
|
known = d.lightchain.HasHeader(h, n)
|
2015-08-14 21:25:41 +03:00
|
|
|
}
|
2021-11-26 14:26:03 +03:00
|
|
|
if !known {
|
|
|
|
end = check
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
header := d.lightchain.GetHeaderByHash(h) // Independent of sync mode, header surely exists
|
|
|
|
if header.Number.Uint64() != check {
|
|
|
|
p.log.Warn("Received non requested header", "number", header.Number, "hash", header.Hash(), "request", check)
|
|
|
|
return 0, fmt.Errorf("%w: non-requested header (%d)", errBadPeer, header.Number)
|
|
|
|
}
|
|
|
|
start = check
|
|
|
|
hash = h
|
2015-08-14 21:25:41 +03:00
|
|
|
}
|
2016-05-13 13:12:13 +03:00
|
|
|
// Ensure valid ancestry and return
|
|
|
|
if int64(start) <= floor {
|
2017-03-02 16:06:16 +03:00
|
|
|
p.log.Warn("Ancestor below allowance", "number", start, "hash", hash, "allowance", floor)
|
2016-05-13 13:12:13 +03:00
|
|
|
return 0, errInvalidAncestor
|
|
|
|
}
|
2017-03-02 16:06:16 +03:00
|
|
|
p.log.Debug("Found common ancestor", "number", start, "hash", hash)
|
2015-08-14 21:25:41 +03:00
|
|
|
return start, nil
|
|
|
|
}
|
|
|
|
|
2016-02-25 18:36:42 +02:00
|
|
|
// fetchHeaders keeps retrieving headers concurrently from the number
|
|
|
|
// requested, until no more are returned, potentially throttling on the way. To
|
|
|
|
// facilitate concurrency but still protect against malicious nodes sending bad
|
|
|
|
// headers, we construct a header chain skeleton using the "origin" peer we are
|
|
|
|
// syncing with, and fill in the missing headers using anyone else. Headers from
|
2016-05-17 11:12:57 +03:00
|
|
|
// other peers are only accepted if they map cleanly to the skeleton. If no one
|
2016-02-25 18:36:42 +02:00
|
|
|
// can fill in the skeleton - not even the origin peer - it's assumed invalid and
|
|
|
|
// the origin is dropped.
|
2021-11-26 14:26:03 +03:00
|
|
|
func (d *Downloader) fetchHeaders(p *peerConnection, from uint64, head uint64) error {
|
2017-03-02 16:06:16 +03:00
|
|
|
p.log.Debug("Directing header downloads", "origin", from)
|
|
|
|
defer p.log.Debug("Header download terminated")
|
2015-08-14 21:25:41 +03:00
|
|
|
|
2021-11-26 14:26:03 +03:00
|
|
|
// Start pulling the header chain skeleton until all is done
|
|
|
|
var (
|
|
|
|
skeleton = true // Skeleton assembly phase or finishing up
|
|
|
|
pivoting = false // Whether the next request is pivot verification
|
|
|
|
ancestor = from
|
|
|
|
mode = d.getMode()
|
|
|
|
)
|
|
|
|
for {
|
|
|
|
// Pull the next batch of headers, it either:
|
|
|
|
// - Pivot check to see if the chain moved too far
|
|
|
|
// - Skeleton retrieval to permit concurrent header fetches
|
|
|
|
// - Full header retrieval if we're near the chain head
|
|
|
|
var (
|
|
|
|
headers []*types.Header
|
|
|
|
err error
|
|
|
|
)
|
|
|
|
switch {
|
|
|
|
case pivoting:
|
|
|
|
d.pivotLock.RLock()
|
|
|
|
pivot := d.pivotHeader.Number.Uint64()
|
|
|
|
d.pivotLock.RUnlock()
|
2017-02-24 19:23:03 +03:00
|
|
|
|
2021-11-26 14:26:03 +03:00
|
|
|
p.log.Trace("Fetching next pivot header", "number", pivot+uint64(fsMinFullBlocks))
|
|
|
|
headers, err = d.fetchHeadersByNumber(p, pivot+uint64(fsMinFullBlocks), 2, fsMinFullBlocks-9, false) // move +64 when it's 2x64-8 deep
|
2016-05-17 11:12:57 +03:00
|
|
|
|
2021-11-26 14:26:03 +03:00
|
|
|
case skeleton:
|
2017-03-02 16:06:16 +03:00
|
|
|
p.log.Trace("Fetching skeleton headers", "count", MaxHeaderFetch, "from", from)
|
2021-11-26 14:26:03 +03:00
|
|
|
headers, err = d.fetchHeadersByNumber(p, from+uint64(MaxHeaderFetch)-1, MaxSkeletonSize, MaxHeaderFetch-1, false)
|
|
|
|
|
|
|
|
default:
|
2017-03-02 16:06:16 +03:00
|
|
|
p.log.Trace("Fetching full headers", "count", MaxHeaderFetch, "from", from)
|
2021-11-26 14:26:03 +03:00
|
|
|
headers, err = d.fetchHeadersByNumber(p, from, MaxHeaderFetch, 0, false)
|
2016-02-25 18:36:42 +02:00
|
|
|
}
|
2021-11-26 14:26:03 +03:00
|
|
|
switch err {
|
|
|
|
case nil:
|
|
|
|
// Headers retrieved, continue with processing
|
2020-09-08 11:13:16 +03:00
|
|
|
|
2021-11-26 14:26:03 +03:00
|
|
|
case errCanceled:
|
|
|
|
// Sync cancelled, no issue, propagate up
|
|
|
|
return err
|
2015-08-14 21:25:41 +03:00
|
|
|
|
2021-11-26 14:26:03 +03:00
|
|
|
default:
|
|
|
|
// Header retrieval either timed out, or the peer failed in some strange way
|
|
|
|
// (e.g. disconnect). Consider the master peer bad and drop
|
|
|
|
d.dropPeer(p.id)
|
2015-08-14 21:25:41 +03:00
|
|
|
|
2021-11-26 14:26:03 +03:00
|
|
|
// Finish the sync gracefully instead of dumping the gathered data though
|
|
|
|
for _, ch := range []chan bool{d.queue.blockWakeCh, d.queue.receiptWakeCh} {
|
|
|
|
select {
|
|
|
|
case ch <- false:
|
|
|
|
case <-d.cancelCh:
|
|
|
|
}
|
2015-08-14 21:25:41 +03:00
|
|
|
}
|
2021-11-26 14:26:03 +03:00
|
|
|
select {
|
|
|
|
case d.headerProcCh <- nil:
|
|
|
|
case <-d.cancelCh:
|
2020-09-08 11:13:16 +03:00
|
|
|
}
|
2021-11-26 14:26:03 +03:00
|
|
|
return fmt.Errorf("%w: header request failed: %v", errBadPeer, err)
|
|
|
|
}
|
|
|
|
// If the pivot is being checked, move if it became stale and run the real retrieval
|
|
|
|
var pivot uint64
|
2020-09-08 11:13:16 +03:00
|
|
|
|
2021-11-26 14:26:03 +03:00
|
|
|
d.pivotLock.RLock()
|
|
|
|
if d.pivotHeader != nil {
|
|
|
|
pivot = d.pivotHeader.Number.Uint64()
|
|
|
|
}
|
|
|
|
d.pivotLock.RUnlock()
|
2020-09-08 11:13:16 +03:00
|
|
|
|
2021-11-26 14:26:03 +03:00
|
|
|
if pivoting {
|
|
|
|
if len(headers) == 2 {
|
|
|
|
if have, want := headers[0].Number.Uint64(), pivot+uint64(fsMinFullBlocks); have != want {
|
|
|
|
log.Warn("Peer sent invalid next pivot", "have", have, "want", want)
|
|
|
|
return fmt.Errorf("%w: next pivot number %d != requested %d", errInvalidChain, have, want)
|
|
|
|
}
|
|
|
|
if have, want := headers[1].Number.Uint64(), pivot+2*uint64(fsMinFullBlocks)-8; have != want {
|
|
|
|
log.Warn("Peer sent invalid pivot confirmer", "have", have, "want", want)
|
|
|
|
return fmt.Errorf("%w: next pivot confirmer number %d != requested %d", errInvalidChain, have, want)
|
|
|
|
}
|
|
|
|
log.Warn("Pivot seemingly stale, moving", "old", pivot, "new", headers[0].Number)
|
|
|
|
pivot = headers[0].Number.Uint64()
|
2020-09-08 11:13:16 +03:00
|
|
|
|
2021-11-26 14:26:03 +03:00
|
|
|
d.pivotLock.Lock()
|
|
|
|
d.pivotHeader = headers[0]
|
|
|
|
d.pivotLock.Unlock()
|
2020-09-08 11:13:16 +03:00
|
|
|
|
2021-11-26 14:26:03 +03:00
|
|
|
// Write out the pivot into the database so a rollback beyond
|
|
|
|
// it will reenable snap sync and update the state root that
|
|
|
|
// the state syncer will be downloading.
|
|
|
|
rawdb.WriteLastPivotNumber(d.stateDB, pivot)
|
2020-09-08 11:13:16 +03:00
|
|
|
}
|
2021-11-26 14:26:03 +03:00
|
|
|
// Disable the pivot check and fetch the next batch of headers
|
|
|
|
pivoting = false
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
// If the skeleton's finished, pull any remaining head headers directly from the origin
|
|
|
|
if skeleton && len(headers) == 0 {
|
|
|
|
// A malicious node might withhold advertised headers indefinitely
|
|
|
|
if from+uint64(MaxHeaderFetch)-1 <= head {
|
|
|
|
p.log.Warn("Peer withheld skeleton headers", "advertised", head, "withheld", from+uint64(MaxHeaderFetch)-1)
|
|
|
|
return fmt.Errorf("%w: withheld skeleton headers: advertised %d, withheld #%d", errStallingPeer, head, from+uint64(MaxHeaderFetch)-1)
|
2016-02-25 18:36:42 +02:00
|
|
|
}
|
2021-11-26 14:26:03 +03:00
|
|
|
p.log.Debug("No skeleton, fetching headers directly")
|
|
|
|
skeleton = false
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
// If no more headers are inbound, notify the content fetchers and return
|
|
|
|
if len(headers) == 0 {
|
|
|
|
// Don't abort header fetches while the pivot is downloading
|
|
|
|
if atomic.LoadInt32(&d.committed) == 0 && pivot <= from {
|
|
|
|
p.log.Debug("No headers, waiting for pivot commit")
|
2016-06-02 12:37:14 +03:00
|
|
|
select {
|
2021-11-26 14:26:03 +03:00
|
|
|
case <-time.After(fsHeaderContCheck):
|
|
|
|
continue
|
2016-06-02 12:37:14 +03:00
|
|
|
case <-d.cancelCh:
|
2019-06-05 15:00:46 +03:00
|
|
|
return errCanceled
|
2016-06-02 12:37:14 +03:00
|
|
|
}
|
2015-08-14 21:25:41 +03:00
|
|
|
}
|
2021-11-26 14:26:03 +03:00
|
|
|
// Pivot done (or not in snap sync) and no more headers, terminate the process
|
|
|
|
p.log.Debug("No more headers available")
|
|
|
|
select {
|
|
|
|
case d.headerProcCh <- nil:
|
|
|
|
return nil
|
|
|
|
case <-d.cancelCh:
|
|
|
|
return errCanceled
|
2015-08-14 21:25:41 +03:00
|
|
|
}
|
2021-11-26 14:26:03 +03:00
|
|
|
}
|
|
|
|
// If we received a skeleton batch, resolve internals concurrently
|
|
|
|
var progressed bool
|
|
|
|
if skeleton {
|
|
|
|
filled, proced, err := d.fillHeaderSkeleton(from, headers)
|
|
|
|
if err != nil {
|
|
|
|
p.log.Debug("Skeleton chain invalid", "err", err)
|
|
|
|
return fmt.Errorf("%w: %v", errInvalidChain, err)
|
|
|
|
}
|
|
|
|
headers = filled[proced:]
|
|
|
|
progressed = proced > 0
|
|
|
|
from += uint64(proced)
|
|
|
|
} else {
|
|
|
|
// A malicious node might withhold advertised headers indefinitely
|
|
|
|
if n := len(headers); n < MaxHeaderFetch && headers[n-1].Number.Uint64() < head {
|
|
|
|
p.log.Warn("Peer withheld headers", "advertised", head, "delivered", headers[n-1].Number.Uint64())
|
|
|
|
return fmt.Errorf("%w: withheld headers: advertised %d, delivered %d", errStallingPeer, head, headers[n-1].Number.Uint64())
|
|
|
|
}
|
|
|
|
// If we're closing in on the chain head, but haven't yet reached it, delay
|
|
|
|
// the last few headers so mini reorgs on the head don't cause invalid hash
|
|
|
|
// chain errors.
|
|
|
|
if n := len(headers); n > 0 {
|
|
|
|
// Retrieve the current head we're at
|
|
|
|
var head uint64
|
|
|
|
if mode == LightSync {
|
|
|
|
head = d.lightchain.CurrentHeader().Number.Uint64()
|
2020-09-08 11:13:16 +03:00
|
|
|
} else {
|
2021-11-26 14:26:03 +03:00
|
|
|
head = d.blockchain.CurrentFastBlock().NumberU64()
|
|
|
|
if full := d.blockchain.CurrentBlock().NumberU64(); head < full {
|
|
|
|
head = full
|
|
|
|
}
|
2020-09-08 11:13:16 +03:00
|
|
|
}
|
2021-11-26 14:26:03 +03:00
|
|
|
// If the head is below the common ancestor, we're actually deduplicating
|
|
|
|
// already existing chain segments, so use the ancestor as the fake head.
|
|
|
|
// Otherwise, we might end up delaying header deliveries pointlessly.
|
|
|
|
if head < ancestor {
|
|
|
|
head = ancestor
|
2018-10-04 16:36:59 +03:00
|
|
|
}
|
2021-11-26 14:26:03 +03:00
|
|
|
// If the head is way older than this batch, delay the last few headers
|
|
|
|
if head+uint64(reorgProtThreshold) < headers[n-1].Number.Uint64() {
|
|
|
|
delay := reorgProtHeaderDelay
|
|
|
|
if delay > n {
|
|
|
|
delay = n
|
|
|
|
}
|
|
|
|
headers = headers[:n-delay]
|
2015-09-28 19:27:31 +03:00
|
|
|
}
|
2015-08-14 21:25:41 +03:00
|
|
|
}
|
2021-11-26 14:26:03 +03:00
|
|
|
}
|
|
|
|
// If no headers have bene delivered, or all of them have been delayed,
|
|
|
|
// sleep a bit and retry. Take care with headers already consumed during
|
|
|
|
// skeleton filling
|
|
|
|
if len(headers) == 0 && !progressed {
|
|
|
|
p.log.Trace("All headers delayed, waiting")
|
2016-02-25 18:36:42 +02:00
|
|
|
select {
|
2021-11-26 14:26:03 +03:00
|
|
|
case <-time.After(fsHeaderContCheck):
|
|
|
|
continue
|
2016-02-25 18:36:42 +02:00
|
|
|
case <-d.cancelCh:
|
2021-11-26 14:26:03 +03:00
|
|
|
return errCanceled
|
2016-02-25 18:36:42 +02:00
|
|
|
}
|
2021-11-26 14:26:03 +03:00
|
|
|
}
|
|
|
|
// Insert any remaining new headers and fetch the next batch
|
|
|
|
if len(headers) > 0 {
|
|
|
|
p.log.Trace("Scheduling new headers", "count", len(headers), "from", from)
|
|
|
|
select {
|
|
|
|
case d.headerProcCh <- headers:
|
|
|
|
case <-d.cancelCh:
|
|
|
|
return errCanceled
|
|
|
|
}
|
|
|
|
from += uint64(len(headers))
|
|
|
|
}
|
|
|
|
// If we're still skeleton filling snap sync, check pivot staleness
|
|
|
|
// before continuing to the next skeleton filling
|
|
|
|
if skeleton && pivot > 0 {
|
|
|
|
pivoting = true
|
2015-08-14 21:25:41 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-02-25 18:36:42 +02:00
|
|
|
// fillHeaderSkeleton concurrently retrieves headers from all our available peers
|
|
|
|
// and maps them to the provided skeleton header chain.
|
2016-04-19 12:27:37 +03:00
|
|
|
//
|
|
|
|
// Any partial results from the beginning of the skeleton is (if possible) forwarded
|
|
|
|
// immediately to the header processor to keep the rest of the pipeline full even
|
|
|
|
// in the case of header stalls.
|
|
|
|
//
|
2018-04-04 13:25:02 +03:00
|
|
|
// The method returns the entire filled skeleton and also the number of headers
|
2016-04-19 12:27:37 +03:00
|
|
|
// already forwarded for processing.
|
|
|
|
func (d *Downloader) fillHeaderSkeleton(from uint64, skeleton []*types.Header) ([]*types.Header, int, error) {
|
2017-02-24 19:23:03 +03:00
|
|
|
log.Debug("Filling up skeleton", "from", from)
|
2016-02-25 18:36:42 +02:00
|
|
|
d.queue.ScheduleSkeleton(from, skeleton)
|
|
|
|
|
2021-11-26 14:26:03 +03:00
|
|
|
err := d.concurrentFetch((*headerQueue)(d))
|
|
|
|
if err != nil {
|
|
|
|
log.Debug("Skeleton fill failed", "err", err)
|
|
|
|
}
|
2016-04-19 12:27:37 +03:00
|
|
|
filled, proced := d.queue.RetrieveHeaders()
|
2021-11-26 14:26:03 +03:00
|
|
|
if err == nil {
|
|
|
|
log.Debug("Skeleton fill succeeded", "filled", len(filled), "processed", proced)
|
|
|
|
}
|
2016-04-19 12:27:37 +03:00
|
|
|
return filled, proced, err
|
2016-02-25 18:36:42 +02:00
|
|
|
}
|
|
|
|
|
2015-08-14 21:25:41 +03:00
|
|
|
// fetchBodies iteratively downloads the scheduled block bodies, taking any
|
|
|
|
// available peers, reserving a chunk of blocks for each, waiting for delivery
|
|
|
|
// and also periodically checking for timeouts.
|
|
|
|
func (d *Downloader) fetchBodies(from uint64) error {
|
2017-02-24 19:23:03 +03:00
|
|
|
log.Debug("Downloading block bodies", "origin", from)
|
2021-11-26 14:26:03 +03:00
|
|
|
err := d.concurrentFetch((*bodyQueue)(d))
|
2015-09-28 19:27:31 +03:00
|
|
|
|
2017-02-27 14:17:58 +03:00
|
|
|
log.Debug("Block body download terminated", "err", err)
|
2015-09-28 19:27:31 +03:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
// fetchReceipts iteratively downloads the scheduled block receipts, taking any
|
|
|
|
// available peers, reserving a chunk of receipts for each, waiting for delivery
|
|
|
|
// and also periodically checking for timeouts.
|
|
|
|
func (d *Downloader) fetchReceipts(from uint64) error {
|
2021-11-26 14:26:03 +03:00
|
|
|
log.Debug("Downloading receipts", "origin", from)
|
|
|
|
err := d.concurrentFetch((*receiptQueue)(d))
|
2015-09-28 19:27:31 +03:00
|
|
|
|
2021-11-26 14:26:03 +03:00
|
|
|
log.Debug("Receipt download terminated", "err", err)
|
2015-09-28 19:27:31 +03:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2016-02-25 18:36:42 +02:00
|
|
|
// processHeaders takes batches of retrieved headers from an input channel and
|
|
|
|
// keeps processing and scheduling them into the header chain and downloader's
|
|
|
|
// queue until the stream ends or a failure occurs.
|
2020-09-08 11:13:16 +03:00
|
|
|
func (d *Downloader) processHeaders(origin uint64, td *big.Int) error {
|
2016-02-25 18:36:42 +02:00
|
|
|
// Keep a count of uncertain headers to roll back
|
2020-07-24 10:46:26 +03:00
|
|
|
var (
|
2020-08-20 13:01:24 +03:00
|
|
|
rollback uint64 // Zero means no rollback (fine as you can't unroll the genesis)
|
2020-07-24 10:46:26 +03:00
|
|
|
rollbackErr error
|
|
|
|
mode = d.getMode()
|
|
|
|
)
|
2016-02-25 18:36:42 +02:00
|
|
|
defer func() {
|
2020-08-20 13:01:24 +03:00
|
|
|
if rollback > 0 {
|
2017-06-27 18:15:29 +03:00
|
|
|
lastHeader, lastFastBlock, lastBlock := d.lightchain.CurrentHeader().Number, common.Big0, common.Big0
|
2020-06-30 20:43:29 +03:00
|
|
|
if mode != LightSync {
|
2017-07-03 17:17:12 +03:00
|
|
|
lastFastBlock = d.blockchain.CurrentFastBlock().Number()
|
|
|
|
lastBlock = d.blockchain.CurrentBlock().Number()
|
2016-01-13 20:35:48 +02:00
|
|
|
}
|
2020-08-20 13:01:24 +03:00
|
|
|
if err := d.lightchain.SetHead(rollback - 1); err != nil { // -1 to target the parent of the first uncertain block
|
|
|
|
// We're already unwinding the stack, only print the error to make it more visible
|
|
|
|
log.Error("Failed to roll back chain segment", "head", rollback-1, "err", err)
|
|
|
|
}
|
2016-01-13 20:35:48 +02:00
|
|
|
curFastBlock, curBlock := common.Big0, common.Big0
|
2020-06-30 20:43:29 +03:00
|
|
|
if mode != LightSync {
|
2017-07-03 17:17:12 +03:00
|
|
|
curFastBlock = d.blockchain.CurrentFastBlock().Number()
|
|
|
|
curBlock = d.blockchain.CurrentBlock().Number()
|
2016-01-13 20:35:48 +02:00
|
|
|
}
|
2020-08-20 13:01:24 +03:00
|
|
|
log.Warn("Rolled back chain segment",
|
2017-06-27 18:15:29 +03:00
|
|
|
"header", fmt.Sprintf("%d->%d", lastHeader, d.lightchain.CurrentHeader().Number),
|
2021-11-26 14:26:03 +03:00
|
|
|
"snap", fmt.Sprintf("%d->%d", lastFastBlock, curFastBlock),
|
2020-07-24 10:46:26 +03:00
|
|
|
"block", fmt.Sprintf("%d->%d", lastBlock, curBlock), "reason", rollbackErr)
|
2016-02-25 18:36:42 +02:00
|
|
|
}
|
|
|
|
}()
|
|
|
|
// Wait for batches of headers to process
|
|
|
|
gotHeaders := false
|
|
|
|
|
|
|
|
for {
|
|
|
|
select {
|
|
|
|
case <-d.cancelCh:
|
2020-07-24 10:46:26 +03:00
|
|
|
rollbackErr = errCanceled
|
2019-06-05 15:00:46 +03:00
|
|
|
return errCanceled
|
2016-02-25 18:36:42 +02:00
|
|
|
|
|
|
|
case headers := <-d.headerProcCh:
|
|
|
|
// Terminate header processing if we synced up
|
|
|
|
if len(headers) == 0 {
|
|
|
|
// Notify everyone that headers are fully processed
|
2021-11-26 14:26:03 +03:00
|
|
|
for _, ch := range []chan bool{d.queue.blockWakeCh, d.queue.receiptWakeCh} {
|
2016-02-25 18:36:42 +02:00
|
|
|
select {
|
|
|
|
case ch <- false:
|
|
|
|
case <-d.cancelCh:
|
|
|
|
}
|
|
|
|
}
|
2017-11-16 14:14:51 +03:00
|
|
|
// If no headers were retrieved at all, the peer violated its TD promise that it had a
|
|
|
|
// better chain compared to ours. The only exception is if its promised blocks were
|
2018-10-23 14:21:16 +03:00
|
|
|
// already imported by other means (e.g. fetcher):
|
2016-02-25 18:36:42 +02:00
|
|
|
//
|
|
|
|
// R <remote peer>, L <local node>: Both at block 10
|
|
|
|
// R: Mine block 11, and propagate it to L
|
|
|
|
// L: Queue block 11 for import
|
|
|
|
// L: Notice that R's head and TD increased compared to ours, start sync
|
|
|
|
// L: Import of block 11 finishes
|
|
|
|
// L: Sync begins, and finds common ancestor at 11
|
|
|
|
// L: Request new headers up from 11 (R's TD was higher, it must have something)
|
|
|
|
// R: Nothing to give
|
2020-06-30 20:43:29 +03:00
|
|
|
if mode != LightSync {
|
2018-01-30 19:39:32 +03:00
|
|
|
head := d.blockchain.CurrentBlock()
|
|
|
|
if !gotHeaders && td.Cmp(d.blockchain.GetTd(head.Hash(), head.NumberU64())) > 0 {
|
2016-01-13 20:35:48 +02:00
|
|
|
return errStallingPeer
|
|
|
|
}
|
2016-02-25 18:36:42 +02:00
|
|
|
}
|
2021-11-26 14:26:03 +03:00
|
|
|
// If snap or light syncing, ensure promised headers are indeed delivered. This is
|
2016-02-25 18:36:42 +02:00
|
|
|
// needed to detect scenarios where an attacker feeds a bad pivot and then bails out
|
|
|
|
// of delivering the post-pivot blocks that would flag the invalid content.
|
|
|
|
//
|
|
|
|
// This check cannot be executed "as is" for full imports, since blocks may still be
|
|
|
|
// queued for processing when the header download completes. However, as long as the
|
|
|
|
// peer gave us something useful, we're already happy/progressed (above check).
|
2021-11-26 14:26:03 +03:00
|
|
|
if mode == SnapSync || mode == LightSync {
|
2018-01-30 19:39:32 +03:00
|
|
|
head := d.lightchain.CurrentHeader()
|
|
|
|
if td.Cmp(d.lightchain.GetTd(head.Hash(), head.Number.Uint64())) > 0 {
|
2016-02-25 18:36:42 +02:00
|
|
|
return errStallingPeer
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// Disable any rollback and return
|
2020-08-20 13:01:24 +03:00
|
|
|
rollback = 0
|
2016-02-25 18:36:42 +02:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
// Otherwise split the chunk of headers into batches and process them
|
|
|
|
gotHeaders = true
|
|
|
|
for len(headers) > 0 {
|
|
|
|
// Terminate if something failed in between processing chunks
|
|
|
|
select {
|
|
|
|
case <-d.cancelCh:
|
2020-07-24 10:46:26 +03:00
|
|
|
rollbackErr = errCanceled
|
2019-06-05 15:00:46 +03:00
|
|
|
return errCanceled
|
2016-02-25 18:36:42 +02:00
|
|
|
default:
|
|
|
|
}
|
|
|
|
// Select the next chunk of headers to import
|
|
|
|
limit := maxHeadersProcess
|
|
|
|
if limit > len(headers) {
|
|
|
|
limit = len(headers)
|
|
|
|
}
|
|
|
|
chunk := headers[:limit]
|
2020-08-20 13:01:24 +03:00
|
|
|
|
2016-02-25 18:36:42 +02:00
|
|
|
// In case of header only syncing, validate the chunk immediately
|
2021-11-26 14:26:03 +03:00
|
|
|
if mode == SnapSync || mode == LightSync {
|
2016-02-25 18:36:42 +02:00
|
|
|
// If we're importing pure headers, verify based on their recentness
|
2020-09-08 11:13:16 +03:00
|
|
|
var pivot uint64
|
|
|
|
|
|
|
|
d.pivotLock.RLock()
|
|
|
|
if d.pivotHeader != nil {
|
|
|
|
pivot = d.pivotHeader.Number.Uint64()
|
|
|
|
}
|
|
|
|
d.pivotLock.RUnlock()
|
|
|
|
|
2016-02-25 18:36:42 +02:00
|
|
|
frequency := fsHeaderCheckFrequency
|
|
|
|
if chunk[len(chunk)-1].Number.Uint64()+uint64(fsHeaderForceVerify) > pivot {
|
|
|
|
frequency = 1
|
|
|
|
}
|
2017-06-28 15:25:08 +03:00
|
|
|
if n, err := d.lightchain.InsertHeaderChain(chunk, frequency); err != nil {
|
2020-07-24 10:46:26 +03:00
|
|
|
rollbackErr = err
|
2020-08-20 13:01:24 +03:00
|
|
|
|
|
|
|
// If some headers were inserted, track them as uncertain
|
2021-11-26 14:26:03 +03:00
|
|
|
if (mode == SnapSync || frequency > 1) && n > 0 && rollback == 0 {
|
2020-08-20 13:01:24 +03:00
|
|
|
rollback = chunk[0].Number.Uint64()
|
2016-02-25 18:36:42 +02:00
|
|
|
}
|
2020-09-09 14:06:32 +03:00
|
|
|
log.Warn("Invalid header encountered", "number", chunk[n].Number, "hash", chunk[n].Hash(), "parent", chunk[n].ParentHash, "err", err)
|
2020-05-29 12:12:43 +03:00
|
|
|
return fmt.Errorf("%w: %v", errInvalidChain, err)
|
2016-02-25 18:36:42 +02:00
|
|
|
}
|
2020-08-20 13:01:24 +03:00
|
|
|
// All verifications passed, track all headers within the alloted limits
|
2021-11-26 14:26:03 +03:00
|
|
|
if mode == SnapSync {
|
2020-09-09 14:06:32 +03:00
|
|
|
head := chunk[len(chunk)-1].Number.Uint64()
|
|
|
|
if head-rollback > uint64(fsHeaderSafetyNet) {
|
|
|
|
rollback = head - uint64(fsHeaderSafetyNet)
|
|
|
|
} else {
|
|
|
|
rollback = 1
|
|
|
|
}
|
2016-02-25 18:36:42 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
// Unless we're doing light chains, schedule the headers for associated content retrieval
|
2021-11-26 14:26:03 +03:00
|
|
|
if mode == FullSync || mode == SnapSync {
|
2016-02-25 18:36:42 +02:00
|
|
|
// If we've reached the allowed number of pending headers, stall a bit
|
2021-11-26 14:26:03 +03:00
|
|
|
for d.queue.PendingBodies() >= maxQueuedHeaders || d.queue.PendingReceipts() >= maxQueuedHeaders {
|
2016-02-25 18:36:42 +02:00
|
|
|
select {
|
|
|
|
case <-d.cancelCh:
|
2020-07-24 10:46:26 +03:00
|
|
|
rollbackErr = errCanceled
|
2019-06-05 15:00:46 +03:00
|
|
|
return errCanceled
|
2016-02-25 18:36:42 +02:00
|
|
|
case <-time.After(time.Second):
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// Otherwise insert the headers for content retrieval
|
|
|
|
inserts := d.queue.Schedule(chunk, origin)
|
|
|
|
if len(inserts) != len(chunk) {
|
2020-07-24 10:46:26 +03:00
|
|
|
rollbackErr = fmt.Errorf("stale headers: len inserts %v len(chunk) %v", len(inserts), len(chunk))
|
2020-07-09 00:08:08 +03:00
|
|
|
return fmt.Errorf("%w: stale headers", errBadPeer)
|
2016-02-25 18:36:42 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
headers = headers[limit:]
|
|
|
|
origin += uint64(limit)
|
|
|
|
}
|
2018-03-09 12:51:30 +03:00
|
|
|
// Update the highest block number we know if a higher one is found.
|
|
|
|
d.syncStatsLock.Lock()
|
|
|
|
if d.syncStatsChainHeight < origin {
|
|
|
|
d.syncStatsChainHeight = origin - 1
|
|
|
|
}
|
|
|
|
d.syncStatsLock.Unlock()
|
|
|
|
|
2016-02-25 18:36:42 +02:00
|
|
|
// Signal the content downloaders of the availablility of new tasks
|
2021-11-26 14:26:03 +03:00
|
|
|
for _, ch := range []chan bool{d.queue.blockWakeCh, d.queue.receiptWakeCh} {
|
2016-02-25 18:36:42 +02:00
|
|
|
select {
|
|
|
|
case ch <- true:
|
|
|
|
default:
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
eth/downloader: separate state sync from queue (#14460)
* eth/downloader: separate state sync from queue
Scheduling of state node downloads hogged the downloader queue lock when
new requests were scheduled. This caused timeouts for other requests.
With this change, state sync is fully independent of all other downloads
and doesn't involve the queue at all.
State sync is started and checked on in processContent. This is slightly
awkward because processContent doesn't have a select loop. Instead, the
queue is closed by an auxiliary goroutine when state sync fails. We
tried several alternatives to this but settled on the current approach
because it's the least amount of change overall.
Handling of the pivot block has changed slightly: the queue previously
prevented import of pivot block receipts before the state of the pivot
block was available. In this commit, the receipt will be imported before
the state. This causes an annoyance where the pivot block is committed
as fast block head even when state downloads fail. Stay tuned for more
updates in this area ;)
* eth/downloader: remove cancelTimeout channel
* eth/downloader: retry state requests on timeout
* eth/downloader: improve comment
* eth/downloader: mark peers idle when state sync is done
* eth/downloader: move pivot block splitting to processContent
This change also ensures that pivot block receipts aren't imported
before the pivot block itself.
* eth/downloader: limit state node retries
* eth/downloader: improve state node error handling and retry check
* eth/downloader: remove maxStateNodeRetries
It fails the sync too much.
* eth/downloader: remove last use of cancelCh in statesync.go
Fixes TestDeliverHeadersHang*Fast and (hopefully)
the weird cancellation behaviour at the end of fast sync.
* eth/downloader: fix leak in runStateSync
* eth/downloader: don't run processFullSyncContent in LightSync mode
* eth/downloader: improve comments
* eth/downloader: fix vet, megacheck
* eth/downloader: remove unrequested tasks anyway
* eth/downloader, trie: various polishes around duplicate items
This commit explicitly tracks duplicate and unexpected state
deliveries done against a trie Sync structure, also adding
there to import info logs.
The commit moves the db batch used to commit trie changes one
level deeper so its flushed after every node insertion. This
is needed to avoid a lot of duplicate retrievals caused by
inconsistencies between Sync internals and database. A better
approach is to track not-yet-written states in trie.Sync and
flush on commit, but I'm focusing on correctness first now.
The commit fixes a regression around pivot block fail count.
The counter previously was reset to 1 if and only if a sync
cycle progressed (inserted at least 1 entry to the database).
The current code reset it already if a node was delivered,
which is not strong enough, because unless it ends up written
to disk, an attacker can just loop and attack ad infinitum.
The commit also fixes a regression around state deliveries
and timeouts. The old downloader tracked if a delivery is
stale (none of the deliveries were requested), in which
case it didn't mark the node idle and did not send further
requests, since it signals a past timeout. The current code
did mark it idle even on stale deliveries, which eventually
caused two requests to be in flight at the same time, making
the deliveries always stale and mass duplicating retrievals
between multiple peers.
* eth/downloader: fix state request leak
This commit fixes the hang seen sometimes while doing the state
sync. The cause of the hang was a rare combination of events:
request state data from peer, peer drops and reconnects almost
immediately. This caused a new download task to be assigned to
the peer, overwriting the old one still waiting for a timeout,
which in turn leaked the requests out, never to be retried.
The fix is to ensure that a task assignment moves any pending
one back into the retry queue.
The commit also fixes a regression with peer dropping due to
stalls. The current code considered a peer stalling if they
timed out delivering 1 item. However, the downloader never
requests only one, the minimum is 2 (attempt to fine tune
estimated latency/bandwidth). The fix is simply to drop if
a timeout is detected at 2 items.
Apart from the above bugfixes, the commit contains some code
polishes I made while debugging the hang.
* core, eth, trie: support batched trie sync db writes
* trie: rename SyncMemCache to syncMemBatch
2017-06-22 15:26:03 +03:00
|
|
|
// processFullSyncContent takes fetch results from the queue and imports them into the chain.
|
|
|
|
func (d *Downloader) processFullSyncContent() error {
|
2015-06-12 13:35:29 +03:00
|
|
|
for {
|
2018-02-05 19:40:32 +03:00
|
|
|
results := d.queue.Results(true)
|
2015-09-28 19:27:31 +03:00
|
|
|
if len(results) == 0 {
|
eth/downloader: separate state sync from queue (#14460)
* eth/downloader: separate state sync from queue
Scheduling of state node downloads hogged the downloader queue lock when
new requests were scheduled. This caused timeouts for other requests.
With this change, state sync is fully independent of all other downloads
and doesn't involve the queue at all.
State sync is started and checked on in processContent. This is slightly
awkward because processContent doesn't have a select loop. Instead, the
queue is closed by an auxiliary goroutine when state sync fails. We
tried several alternatives to this but settled on the current approach
because it's the least amount of change overall.
Handling of the pivot block has changed slightly: the queue previously
prevented import of pivot block receipts before the state of the pivot
block was available. In this commit, the receipt will be imported before
the state. This causes an annoyance where the pivot block is committed
as fast block head even when state downloads fail. Stay tuned for more
updates in this area ;)
* eth/downloader: remove cancelTimeout channel
* eth/downloader: retry state requests on timeout
* eth/downloader: improve comment
* eth/downloader: mark peers idle when state sync is done
* eth/downloader: move pivot block splitting to processContent
This change also ensures that pivot block receipts aren't imported
before the pivot block itself.
* eth/downloader: limit state node retries
* eth/downloader: improve state node error handling and retry check
* eth/downloader: remove maxStateNodeRetries
It fails the sync too much.
* eth/downloader: remove last use of cancelCh in statesync.go
Fixes TestDeliverHeadersHang*Fast and (hopefully)
the weird cancellation behaviour at the end of fast sync.
* eth/downloader: fix leak in runStateSync
* eth/downloader: don't run processFullSyncContent in LightSync mode
* eth/downloader: improve comments
* eth/downloader: fix vet, megacheck
* eth/downloader: remove unrequested tasks anyway
* eth/downloader, trie: various polishes around duplicate items
This commit explicitly tracks duplicate and unexpected state
delieveries done against a trie Sync structure, also adding
there to import info logs.
The commit moves the db batch used to commit trie changes one
level deeper so its flushed after every node insertion. This
is needed to avoid a lot of duplicate retrievals caused by
inconsistencies between Sync internals and database. A better
approach is to track not-yet-written states in trie.Sync and
flush on commit, but I'm focuing on correctness first now.
The commit fixes a regression around pivot block fail count.
The counter previously was reset to 1 if and only if a sync
cycle progressed (inserted at least 1 entry to the database).
The current code reset it already if a node was delivered,
which is not stong enough, because unless it ends up written
to disk, an attacker can just loop and attack ad infinitum.
The commit also fixes a regression around state deliveries
and timeouts. The old downloader tracked if a delivery is
stale (none of the deliveries were requestedt), in which
case it didn't mark the node idle and did not send further
requests, since it signals a past timeout. The current code
did mark it idle even on stale deliveries, which eventually
caused two requests to be in flight at the same time, making
the deliveries always stale and mass duplicating retrievals
between multiple peers.
* eth/downloader: fix state request leak
This commit fixes the hang seen sometimes while doing the state
sync. The cause of the hang was a rare combination of events:
request state data from peer, peer drops and reconnects almost
immediately. This caused a new download task to be assigned to
the peer, overwriting the old one still waiting for a timeout,
which in turned leaked the requests out, never to be retried.
The fix is to ensure that a task assignment moves any pending
one back into the retry queue.
The commit also fixes a regression with peer dropping due to
stalls. The current code considered a peer stalling if they
timed out delivering 1 item. However, the downloader never
requests only one, the minimum is 2 (attempt to fine tune
estimated latency/bandwidth). The fix is simply to drop if
a timeout is detected at 2 items.
Apart from the above bugfixes, the commit contains some code
polishes I made while debugging the hang.
* core, eth, trie: support batched trie sync db writes
* trie: rename SyncMemCache to syncMemBatch
2017-06-22 15:26:03 +03:00
|
|
|
return nil
|
2015-06-12 13:35:29 +03:00
|
|
|
}
|
2015-08-14 21:25:41 +03:00
|
|
|
if d.chainInsertHook != nil {
|
2015-09-28 19:27:31 +03:00
|
|
|
d.chainInsertHook(results)
|
2015-08-14 21:25:41 +03:00
|
|
|
}
|
eth/downloader: separate state sync from queue (#14460)
* eth/downloader: separate state sync from queue
Scheduling of state node downloads hogged the downloader queue lock when
new requests were scheduled. This caused timeouts for other requests.
With this change, state sync is fully independent of all other downloads
and doesn't involve the queue at all.
State sync is started and checked on in processContent. This is slightly
awkward because processContent doesn't have a select loop. Instead, the
queue is closed by an auxiliary goroutine when state sync fails. We
tried several alternatives to this but settled on the current approach
because it's the least amount of change overall.
Handling of the pivot block has changed slightly: the queue previously
prevented import of pivot block receipts before the state of the pivot
block was available. In this commit, the receipt will be imported before
the state. This causes an annoyance where the pivot block is committed
as fast block head even when state downloads fail. Stay tuned for more
updates in this area ;)
* eth/downloader: remove cancelTimeout channel
* eth/downloader: retry state requests on timeout
* eth/downloader: improve comment
* eth/downloader: mark peers idle when state sync is done
* eth/downloader: move pivot block splitting to processContent
This change also ensures that pivot block receipts aren't imported
before the pivot block itself.
* eth/downloader: limit state node retries
* eth/downloader: improve state node error handling and retry check
* eth/downloader: remove maxStateNodeRetries
It fails the sync too much.
* eth/downloader: remove last use of cancelCh in statesync.go
Fixes TestDeliverHeadersHang*Fast and (hopefully)
the weird cancellation behaviour at the end of fast sync.
* eth/downloader: fix leak in runStateSync
* eth/downloader: don't run processFullSyncContent in LightSync mode
* eth/downloader: improve comments
* eth/downloader: fix vet, megacheck
* eth/downloader: remove unrequested tasks anyway
* eth/downloader, trie: various polishes around duplicate items
This commit explicitly tracks duplicate and unexpected state
delieveries done against a trie Sync structure, also adding
there to import info logs.
The commit moves the db batch used to commit trie changes one
level deeper so its flushed after every node insertion. This
is needed to avoid a lot of duplicate retrievals caused by
inconsistencies between Sync internals and database. A better
approach is to track not-yet-written states in trie.Sync and
flush on commit, but I'm focuing on correctness first now.
The commit fixes a regression around pivot block fail count.
The counter previously was reset to 1 if and only if a sync
cycle progressed (inserted at least 1 entry to the database).
The current code reset it already if a node was delivered,
which is not stong enough, because unless it ends up written
to disk, an attacker can just loop and attack ad infinitum.
The commit also fixes a regression around state deliveries
and timeouts. The old downloader tracked if a delivery is
stale (none of the deliveries were requestedt), in which
case it didn't mark the node idle and did not send further
requests, since it signals a past timeout. The current code
did mark it idle even on stale deliveries, which eventually
caused two requests to be in flight at the same time, making
the deliveries always stale and mass duplicating retrievals
between multiple peers.
* eth/downloader: fix state request leak
This commit fixes the hang seen sometimes while doing the state
sync. The cause of the hang was a rare combination of events:
request state data from peer, peer drops and reconnects almost
immediately. This caused a new download task to be assigned to
the peer, overwriting the old one still waiting for a timeout,
which in turned leaked the requests out, never to be retried.
The fix is to ensure that a task assignment moves any pending
one back into the retry queue.
The commit also fixes a regression with peer dropping due to
stalls. The current code considered a peer stalling if they
timed out delivering 1 item. However, the downloader never
requests only one, the minimum is 2 (attempt to fine tune
estimated latency/bandwidth). The fix is simply to drop if
a timeout is detected at 2 items.
Apart from the above bugfixes, the commit contains some code
polishes I made while debugging the hang.
* core, eth, trie: support batched trie sync db writes
* trie: rename SyncMemCache to syncMemBatch
2017-06-22 15:26:03 +03:00
|
|
|
if err := d.importBlockResults(results); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (d *Downloader) importBlockResults(results []*fetchResult) error {
|
2018-02-05 19:40:32 +03:00
|
|
|
// Check for any early termination requests
|
|
|
|
if len(results) == 0 {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
select {
|
|
|
|
case <-d.quitCh:
|
|
|
|
return errCancelContentProcessing
|
|
|
|
default:
|
|
|
|
}
|
|
|
|
// Retrieve the a batch of results to import
|
|
|
|
first, last := results[0].Header, results[len(results)-1].Header
|
|
|
|
log.Debug("Inserting downloaded chain", "items", len(results),
|
|
|
|
"firstnum", first.Number, "firsthash", first.Hash(),
|
|
|
|
"lastnum", last.Number, "lasthash", last.Hash(),
|
|
|
|
)
|
|
|
|
blocks := make([]*types.Block, len(results))
|
|
|
|
for i, result := range results {
|
|
|
|
blocks[i] = types.NewBlockWithHeader(result.Header).WithBody(result.Transactions, result.Uncles)
|
|
|
|
}
|
all: core rework for the merge transition (#23761)
* all: work for eth1/2 transtition
* consensus/beacon, eth: change beacon difficulty to 0
* eth: updates
* all: add terminalBlockDifficulty config, fix rebasing issues
* eth: implemented merge interop spec
* internal/ethapi: update to v1.0.0.alpha.2
This commit updates the code to the new spec, moving payloadId into
it's own object. It also fixes an issue with finalizing an empty blockhash.
It also properly sets the basefee
* all: sync polishes, other fixes + refactors
* core, eth: correct semantics for LeavePoW, EnterPoS
* core: fixed rebasing artifacts
* core: light: performance improvements
* core: use keyed field (f)
* core: eth: fix compilation issues + tests
* eth/catalyst: dbetter error codes
* all: move Merger to consensus/, remove reliance on it in bc
* all: renamed EnterPoS and LeavePoW to ReachTDD and FinalizePoS
* core: make mergelogs a function
* core: use InsertChain instead of InsertBlock
* les: drop merger from lightchain object
* consensus: add merger
* core: recoverAncestors in catalyst mode
* core: fix nitpick
* all: removed merger from beacon, use TTD, nitpicks
* consensus: eth: add docstring, removed unnecessary code duplication
* consensus/beacon: better comment
* all: easy to fix nitpicks by karalabe
* consensus/beacon: verify known headers to be sure
* core: comments
* core: eth: don't drop peers who advertise blocks, nitpicks
* core: never add beacon blocks to the future queue
* core: fixed nitpicks
* consensus/beacon: simplify IsTTDReached check
* consensus/beacon: correct IsTTDReached check
Co-authored-by: rjl493456442 <garyrong0905@gmail.com>
Co-authored-by: Péter Szilágyi <peterke@gmail.com>
2021-11-26 14:23:02 +03:00
|
|
|
// Downloaded blocks are always regarded as trusted after the
|
|
|
|
// transition. Because the downloaded chain is guided by the
|
|
|
|
// consensus-layer.
|
2018-02-05 19:40:32 +03:00
|
|
|
if index, err := d.blockchain.InsertChain(blocks); err != nil {
|
2018-12-20 12:46:08 +03:00
|
|
|
if index < len(results) {
|
|
|
|
log.Debug("Downloaded item processing failed", "number", results[index].Header.Number, "hash", results[index].Header.Hash(), "err", err)
|
|
|
|
} else {
|
|
|
|
// The InsertChain method in blockchain.go will sometimes return an out-of-bounds index,
|
|
|
|
// when it needs to preprocess blocks to import a sidechain.
|
|
|
|
// The importer will put together a new list of blocks to import, which is a superset
|
|
|
|
// of the blocks delivered from the downloader, and the indexing will be off.
|
|
|
|
log.Debug("Downloaded item processing failed on sidechain import", "index", index, "err", err)
|
|
|
|
}
|
2020-05-29 12:12:43 +03:00
|
|
|
return fmt.Errorf("%w: %v", errInvalidChain, err)
|
eth/downloader: separate state sync from queue (#14460)
* eth/downloader: separate state sync from queue
Scheduling of state node downloads hogged the downloader queue lock when
new requests were scheduled. This caused timeouts for other requests.
With this change, state sync is fully independent of all other downloads
and doesn't involve the queue at all.
State sync is started and checked on in processContent. This is slightly
awkward because processContent doesn't have a select loop. Instead, the
queue is closed by an auxiliary goroutine when state sync fails. We
tried several alternatives to this but settled on the current approach
because it's the least amount of change overall.
Handling of the pivot block has changed slightly: the queue previously
prevented import of pivot block receipts before the state of the pivot
block was available. In this commit, the receipt will be imported before
the state. This causes an annoyance where the pivot block is committed
as fast block head even when state downloads fail. Stay tuned for more
updates in this area ;)
* eth/downloader: remove cancelTimeout channel
* eth/downloader: retry state requests on timeout
* eth/downloader: improve comment
* eth/downloader: mark peers idle when state sync is done
* eth/downloader: move pivot block splitting to processContent
This change also ensures that pivot block receipts aren't imported
before the pivot block itself.
* eth/downloader: limit state node retries
* eth/downloader: improve state node error handling and retry check
* eth/downloader: remove maxStateNodeRetries
It fails the sync too much.
* eth/downloader: remove last use of cancelCh in statesync.go
Fixes TestDeliverHeadersHang*Fast and (hopefully)
the weird cancellation behaviour at the end of fast sync.
* eth/downloader: fix leak in runStateSync
* eth/downloader: don't run processFullSyncContent in LightSync mode
* eth/downloader: improve comments
* eth/downloader: fix vet, megacheck
* eth/downloader: remove unrequested tasks anyway
* eth/downloader, trie: various polishes around duplicate items
This commit explicitly tracks duplicate and unexpected state
delieveries done against a trie Sync structure, also adding
there to import info logs.
The commit moves the db batch used to commit trie changes one
level deeper so its flushed after every node insertion. This
is needed to avoid a lot of duplicate retrievals caused by
inconsistencies between Sync internals and database. A better
approach is to track not-yet-written states in trie.Sync and
flush on commit, but I'm focuing on correctness first now.
The commit fixes a regression around pivot block fail count.
The counter previously was reset to 1 if and only if a sync
cycle progressed (inserted at least 1 entry to the database).
The current code reset it already if a node was delivered,
which is not stong enough, because unless it ends up written
to disk, an attacker can just loop and attack ad infinitum.
The commit also fixes a regression around state deliveries
and timeouts. The old downloader tracked if a delivery is
stale (none of the deliveries were requestedt), in which
case it didn't mark the node idle and did not send further
requests, since it signals a past timeout. The current code
did mark it idle even on stale deliveries, which eventually
caused two requests to be in flight at the same time, making
the deliveries always stale and mass duplicating retrievals
between multiple peers.
* eth/downloader: fix state request leak
This commit fixes the hang seen sometimes while doing the state
sync. The cause of the hang was a rare combination of events:
request state data from peer, peer drops and reconnects almost
immediately. This caused a new download task to be assigned to
the peer, overwriting the old one still waiting for a timeout,
which in turned leaked the requests out, never to be retried.
The fix is to ensure that a task assignment moves any pending
one back into the retry queue.
The commit also fixes a regression with peer dropping due to
stalls. The current code considered a peer stalling if they
timed out delivering 1 item. However, the downloader never
requests only one, the minimum is 2 (attempt to fine tune
estimated latency/bandwidth). The fix is simply to drop if
a timeout is detected at 2 items.
Apart from the above bugfixes, the commit contains some code
polishes I made while debugging the hang.
* core, eth, trie: support batched trie sync db writes
* trie: rename SyncMemCache to syncMemBatch
2017-06-22 15:26:03 +03:00
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2021-11-26 14:26:03 +03:00
|
|
|
// processSnapSyncContent takes fetch results from the queue and writes them to the
|
eth/downloader: separate state sync from queue (#14460)
* eth/downloader: separate state sync from queue
Scheduling of state node downloads hogged the downloader queue lock when
new requests were scheduled. This caused timeouts for other requests.
With this change, state sync is fully independent of all other downloads
and doesn't involve the queue at all.
State sync is started and checked on in processContent. This is slightly
awkward because processContent doesn't have a select loop. Instead, the
queue is closed by an auxiliary goroutine when state sync fails. We
tried several alternatives to this but settled on the current approach
because it's the least amount of change overall.
Handling of the pivot block has changed slightly: the queue previously
prevented import of pivot block receipts before the state of the pivot
block was available. In this commit, the receipt will be imported before
the state. This causes an annoyance where the pivot block is committed
as fast block head even when state downloads fail. Stay tuned for more
updates in this area ;)
* eth/downloader: remove cancelTimeout channel
* eth/downloader: retry state requests on timeout
* eth/downloader: improve comment
* eth/downloader: mark peers idle when state sync is done
* eth/downloader: move pivot block splitting to processContent
This change also ensures that pivot block receipts aren't imported
before the pivot block itself.
* eth/downloader: limit state node retries
* eth/downloader: improve state node error handling and retry check
* eth/downloader: remove maxStateNodeRetries
It fails the sync too much.
* eth/downloader: remove last use of cancelCh in statesync.go
Fixes TestDeliverHeadersHang*Fast and (hopefully)
the weird cancellation behaviour at the end of fast sync.
* eth/downloader: fix leak in runStateSync
* eth/downloader: don't run processFullSyncContent in LightSync mode
* eth/downloader: improve comments
* eth/downloader: fix vet, megacheck
* eth/downloader: remove unrequested tasks anyway
* eth/downloader, trie: various polishes around duplicate items
This commit explicitly tracks duplicate and unexpected state
deliveries done against a trie Sync structure, also adding
there to import info logs.
The commit moves the db batch used to commit trie changes one
level deeper so its flushed after every node insertion. This
is needed to avoid a lot of duplicate retrievals caused by
inconsistencies between Sync internals and database. A better
approach is to track not-yet-written states in trie.Sync and
flush on commit, but I'm focusing on correctness first now.
The commit fixes a regression around pivot block fail count.
The counter previously was reset to 1 if and only if a sync
cycle progressed (inserted at least 1 entry to the database).
The current code reset it already if a node was delivered,
which is not strong enough, because unless it ends up written
to disk, an attacker can just loop and attack ad infinitum.
The commit also fixes a regression around state deliveries
and timeouts. The old downloader tracked if a delivery is
stale (none of the deliveries were requested), in which
case it didn't mark the node idle and did not send further
requests, since it signals a past timeout. The current code
did mark it idle even on stale deliveries, which eventually
caused two requests to be in flight at the same time, making
the deliveries always stale and mass duplicating retrievals
between multiple peers.
* eth/downloader: fix state request leak
This commit fixes the hang seen sometimes while doing the state
sync. The cause of the hang was a rare combination of events:
request state data from peer, peer drops and reconnects almost
immediately. This caused a new download task to be assigned to
the peer, overwriting the old one still waiting for a timeout,
which in turn leaked the requests out, never to be retried.
The fix is to ensure that a task assignment moves any pending
one back into the retry queue.
The commit also fixes a regression with peer dropping due to
stalls. The current code considered a peer stalling if they
timed out delivering 1 item. However, the downloader never
requests only one, the minimum is 2 (attempt to fine tune
estimated latency/bandwidth). The fix is simply to drop if
a timeout is detected at 2 items.
Apart from the above bugfixes, the commit contains some code
polishes I made while debugging the hang.
* core, eth, trie: support batched trie sync db writes
* trie: rename SyncMemCache to syncMemBatch
2017-06-22 15:26:03 +03:00
|
|
|
// database. It also controls the synchronisation of state nodes of the pivot block.
|
2021-11-26 14:26:03 +03:00
|
|
|
func (d *Downloader) processSnapSyncContent() error {
|
2018-02-05 19:40:32 +03:00
|
|
|
// Start syncing state of the reported head block. This should get us most of
|
|
|
|
// the state of the pivot block.
|
2020-09-08 11:13:16 +03:00
|
|
|
d.pivotLock.RLock()
|
|
|
|
sync := d.syncState(d.pivotHeader.Root)
|
|
|
|
d.pivotLock.RUnlock()
|
|
|
|
|
2020-08-26 13:05:06 +03:00
|
|
|
defer func() {
|
|
|
|
// The `sync` object is replaced every time the pivot moves. We need to
|
|
|
|
// defer close the very last active one, hence the lazy evaluation vs.
|
|
|
|
// calling defer sync.Cancel() !!!
|
|
|
|
sync.Cancel()
|
|
|
|
}()
|
|
|
|
|
2019-10-25 14:17:32 +03:00
|
|
|
closeOnErr := func(s *stateSync) {
|
2021-02-16 17:11:33 +03:00
|
|
|
if err := s.Wait(); err != nil && err != errCancelStateFetch && err != errCanceled && err != snap.ErrCancelled {
|
2018-10-23 14:21:16 +03:00
|
|
|
d.queue.Close() // wake up Results
|
eth/downloader: separate state sync from queue (#14460)
* eth/downloader: separate state sync from queue
Scheduling of state node downloads hogged the downloader queue lock when
new requests were scheduled. This caused timeouts for other requests.
With this change, state sync is fully independent of all other downloads
and doesn't involve the queue at all.
State sync is started and checked on in processContent. This is slightly
awkward because processContent doesn't have a select loop. Instead, the
queue is closed by an auxiliary goroutine when state sync fails. We
tried several alternatives to this but settled on the current approach
because it's the least amount of change overall.
Handling of the pivot block has changed slightly: the queue previously
prevented import of pivot block receipts before the state of the pivot
block was available. In this commit, the receipt will be imported before
the state. This causes an annoyance where the pivot block is committed
as fast block head even when state downloads fail. Stay tuned for more
updates in this area ;)
* eth/downloader: remove cancelTimeout channel
* eth/downloader: retry state requests on timeout
* eth/downloader: improve comment
* eth/downloader: mark peers idle when state sync is done
* eth/downloader: move pivot block splitting to processContent
This change also ensures that pivot block receipts aren't imported
before the pivot block itself.
* eth/downloader: limit state node retries
* eth/downloader: improve state node error handling and retry check
* eth/downloader: remove maxStateNodeRetries
It fails the sync too much.
* eth/downloader: remove last use of cancelCh in statesync.go
Fixes TestDeliverHeadersHang*Fast and (hopefully)
the weird cancellation behaviour at the end of fast sync.
* eth/downloader: fix leak in runStateSync
* eth/downloader: don't run processFullSyncContent in LightSync mode
* eth/downloader: improve comments
* eth/downloader: fix vet, megacheck
* eth/downloader: remove unrequested tasks anyway
* eth/downloader, trie: various polishes around duplicate items
This commit explicitly tracks duplicate and unexpected state
deliveries done against a trie Sync structure, also adding
there to import info logs.
The commit moves the db batch used to commit trie changes one
level deeper so its flushed after every node insertion. This
is needed to avoid a lot of duplicate retrievals caused by
inconsistencies between Sync internals and database. A better
approach is to track not-yet-written states in trie.Sync and
flush on commit, but I'm focusing on correctness first now.
The commit fixes a regression around pivot block fail count.
The counter previously was reset to 1 if and only if a sync
cycle progressed (inserted at least 1 entry to the database).
The current code reset it already if a node was delivered,
which is not strong enough, because unless it ends up written
to disk, an attacker can just loop and attack ad infinitum.
The commit also fixes a regression around state deliveries
and timeouts. The old downloader tracked if a delivery is
stale (none of the deliveries were requested), in which
case it didn't mark the node idle and did not send further
requests, since it signals a past timeout. The current code
did mark it idle even on stale deliveries, which eventually
caused two requests to be in flight at the same time, making
the deliveries always stale and mass duplicating retrievals
between multiple peers.
* eth/downloader: fix state request leak
This commit fixes the hang seen sometimes while doing the state
sync. The cause of the hang was a rare combination of events:
request state data from peer, peer drops and reconnects almost
immediately. This caused a new download task to be assigned to
the peer, overwriting the old one still waiting for a timeout,
which in turn leaked the requests out, never to be retried.
The fix is to ensure that a task assignment moves any pending
one back into the retry queue.
The commit also fixes a regression with peer dropping due to
stalls. The current code considered a peer stalling if they
timed out delivering 1 item. However, the downloader never
requests only one, the minimum is 2 (attempt to fine tune
estimated latency/bandwidth). The fix is simply to drop if
a timeout is detected at 2 items.
Apart from the above bugfixes, the commit contains some code
polishes I made while debugging the hang.
* core, eth, trie: support batched trie sync db writes
* trie: rename SyncMemCache to syncMemBatch
2017-06-22 15:26:03 +03:00
|
|
|
}
|
2019-10-25 14:17:32 +03:00
|
|
|
}
|
|
|
|
go closeOnErr(sync)
|
2020-08-20 13:01:24 +03:00
|
|
|
|
2018-02-05 19:40:32 +03:00
|
|
|
// To cater for moving pivot points, track the pivot block and subsequently
|
2018-04-04 13:25:02 +03:00
|
|
|
// accumulated download results separately.
|
2018-02-05 19:40:32 +03:00
|
|
|
var (
|
|
|
|
oldPivot *fetchResult // Locked in pivot block, might change eventually
|
|
|
|
oldTail []*fetchResult // Downloaded content after the pivot
|
|
|
|
)
|
eth/downloader: separate state sync from queue (#14460)
* eth/downloader: separate state sync from queue
Scheduling of state node downloads hogged the downloader queue lock when
new requests were scheduled. This caused timeouts for other requests.
With this change, state sync is fully independent of all other downloads
and doesn't involve the queue at all.
State sync is started and checked on in processContent. This is slightly
awkward because processContent doesn't have a select loop. Instead, the
queue is closed by an auxiliary goroutine when state sync fails. We
tried several alternatives to this but settled on the current approach
because it's the least amount of change overall.
Handling of the pivot block has changed slightly: the queue previously
prevented import of pivot block receipts before the state of the pivot
block was available. In this commit, the receipt will be imported before
the state. This causes an annoyance where the pivot block is committed
as fast block head even when state downloads fail. Stay tuned for more
updates in this area ;)
* eth/downloader: remove cancelTimeout channel
* eth/downloader: retry state requests on timeout
* eth/downloader: improve comment
* eth/downloader: mark peers idle when state sync is done
* eth/downloader: move pivot block splitting to processContent
This change also ensures that pivot block receipts aren't imported
before the pivot block itself.
* eth/downloader: limit state node retries
* eth/downloader: improve state node error handling and retry check
* eth/downloader: remove maxStateNodeRetries
It fails the sync too much.
* eth/downloader: remove last use of cancelCh in statesync.go
Fixes TestDeliverHeadersHang*Fast and (hopefully)
the weird cancellation behaviour at the end of fast sync.
* eth/downloader: fix leak in runStateSync
* eth/downloader: don't run processFullSyncContent in LightSync mode
* eth/downloader: improve comments
* eth/downloader: fix vet, megacheck
* eth/downloader: remove unrequested tasks anyway
* eth/downloader, trie: various polishes around duplicate items
This commit explicitly tracks duplicate and unexpected state
deliveries done against a trie Sync structure, also adding
there to import info logs.
The commit moves the db batch used to commit trie changes one
level deeper so its flushed after every node insertion. This
is needed to avoid a lot of duplicate retrievals caused by
inconsistencies between Sync internals and database. A better
approach is to track not-yet-written states in trie.Sync and
flush on commit, but I'm focusing on correctness first now.
The commit fixes a regression around pivot block fail count.
The counter previously was reset to 1 if and only if a sync
cycle progressed (inserted at least 1 entry to the database).
The current code reset it already if a node was delivered,
which is not strong enough, because unless it ends up written
to disk, an attacker can just loop and attack ad infinitum.
The commit also fixes a regression around state deliveries
and timeouts. The old downloader tracked if a delivery is
stale (none of the deliveries were requested), in which
case it didn't mark the node idle and did not send further
requests, since it signals a past timeout. The current code
did mark it idle even on stale deliveries, which eventually
caused two requests to be in flight at the same time, making
the deliveries always stale and mass duplicating retrievals
between multiple peers.
* eth/downloader: fix state request leak
This commit fixes the hang seen sometimes while doing the state
sync. The cause of the hang was a rare combination of events:
request state data from peer, peer drops and reconnects almost
immediately. This caused a new download task to be assigned to
the peer, overwriting the old one still waiting for a timeout,
which in turn leaked the requests out, never to be retried.
The fix is to ensure that a task assignment moves any pending
one back into the retry queue.
The commit also fixes a regression with peer dropping due to
stalls. The current code considered a peer stalling if they
timed out delivering 1 item. However, the downloader never
requests only one, the minimum is 2 (attempt to fine tune
estimated latency/bandwidth). The fix is simply to drop if
a timeout is detected at 2 items.
Apart from the above bugfixes, the commit contains some code
polishes I made while debugging the hang.
* core, eth, trie: support batched trie sync db writes
* trie: rename SyncMemCache to syncMemBatch
2017-06-22 15:26:03 +03:00
|
|
|
for {
|
2018-02-05 19:40:32 +03:00
|
|
|
// Wait for the next batch of downloaded data to be available, and if the pivot
|
|
|
|
// block became stale, move the goalpost
|
|
|
|
results := d.queue.Results(oldPivot == nil) // Block if we're not monitoring pivot staleness
|
eth/downloader: separate state sync from queue (#14460)
* eth/downloader: separate state sync from queue
Scheduling of state node downloads hogged the downloader queue lock when
new requests were scheduled. This caused timeouts for other requests.
With this change, state sync is fully independent of all other downloads
and doesn't involve the queue at all.
State sync is started and checked on in processContent. This is slightly
awkward because processContent doesn't have a select loop. Instead, the
queue is closed by an auxiliary goroutine when state sync fails. We
tried several alternatives to this but settled on the current approach
because it's the least amount of change overall.
Handling of the pivot block has changed slightly: the queue previously
prevented import of pivot block receipts before the state of the pivot
block was available. In this commit, the receipt will be imported before
the state. This causes an annoyance where the pivot block is committed
as fast block head even when state downloads fail. Stay tuned for more
updates in this area ;)
* eth/downloader: remove cancelTimeout channel
* eth/downloader: retry state requests on timeout
* eth/downloader: improve comment
* eth/downloader: mark peers idle when state sync is done
* eth/downloader: move pivot block splitting to processContent
This change also ensures that pivot block receipts aren't imported
before the pivot block itself.
* eth/downloader: limit state node retries
* eth/downloader: improve state node error handling and retry check
* eth/downloader: remove maxStateNodeRetries
It fails the sync too much.
* eth/downloader: remove last use of cancelCh in statesync.go
Fixes TestDeliverHeadersHang*Fast and (hopefully)
the weird cancellation behaviour at the end of fast sync.
* eth/downloader: fix leak in runStateSync
* eth/downloader: don't run processFullSyncContent in LightSync mode
* eth/downloader: improve comments
* eth/downloader: fix vet, megacheck
* eth/downloader: remove unrequested tasks anyway
* eth/downloader, trie: various polishes around duplicate items
This commit explicitly tracks duplicate and unexpected state
deliveries done against a trie Sync structure, also adding
there to import info logs.
The commit moves the db batch used to commit trie changes one
level deeper so its flushed after every node insertion. This
is needed to avoid a lot of duplicate retrievals caused by
inconsistencies between Sync internals and database. A better
approach is to track not-yet-written states in trie.Sync and
flush on commit, but I'm focusing on correctness first now.
The commit fixes a regression around pivot block fail count.
The counter previously was reset to 1 if and only if a sync
cycle progressed (inserted at least 1 entry to the database).
The current code reset it already if a node was delivered,
which is not strong enough, because unless it ends up written
to disk, an attacker can just loop and attack ad infinitum.
The commit also fixes a regression around state deliveries
and timeouts. The old downloader tracked if a delivery is
stale (none of the deliveries were requested), in which
case it didn't mark the node idle and did not send further
requests, since it signals a past timeout. The current code
did mark it idle even on stale deliveries, which eventually
caused two requests to be in flight at the same time, making
the deliveries always stale and mass duplicating retrievals
between multiple peers.
* eth/downloader: fix state request leak
This commit fixes the hang seen sometimes while doing the state
sync. The cause of the hang was a rare combination of events:
request state data from peer, peer drops and reconnects almost
immediately. This caused a new download task to be assigned to
the peer, overwriting the old one still waiting for a timeout,
which in turn leaked the requests out, never to be retried.
The fix is to ensure that a task assignment moves any pending
one back into the retry queue.
The commit also fixes a regression with peer dropping due to
stalls. The current code considered a peer stalling if they
timed out delivering 1 item. However, the downloader never
requests only one, the minimum is 2 (attempt to fine tune
estimated latency/bandwidth). The fix is simply to drop if
a timeout is detected at 2 items.
Apart from the above bugfixes, the commit contains some code
polishes I made while debugging the hang.
* core, eth, trie: support batched trie sync db writes
* trie: rename SyncMemCache to syncMemBatch
2017-06-22 15:26:03 +03:00
|
|
|
if len(results) == 0 {
|
2018-02-05 19:40:32 +03:00
|
|
|
// If pivot sync is done, stop
|
|
|
|
if oldPivot == nil {
|
2019-10-25 14:17:32 +03:00
|
|
|
return sync.Cancel()
|
2018-02-05 19:40:32 +03:00
|
|
|
}
|
|
|
|
// If sync failed, stop
|
|
|
|
select {
|
|
|
|
case <-d.cancelCh:
|
2019-10-25 14:17:32 +03:00
|
|
|
sync.Cancel()
|
2019-06-05 15:00:46 +03:00
|
|
|
return errCanceled
|
2018-02-05 19:40:32 +03:00
|
|
|
default:
|
|
|
|
}
|
eth/downloader: separate state sync from queue (#14460)
* eth/downloader: separate state sync from queue
Scheduling of state node downloads hogged the downloader queue lock when
new requests were scheduled. This caused timeouts for other requests.
With this change, state sync is fully independent of all other downloads
and doesn't involve the queue at all.
State sync is started and checked on in processContent. This is slightly
awkward because processContent doesn't have a select loop. Instead, the
queue is closed by an auxiliary goroutine when state sync fails. We
tried several alternatives to this but settled on the current approach
because it's the least amount of change overall.
Handling of the pivot block has changed slightly: the queue previously
prevented import of pivot block receipts before the state of the pivot
block was available. In this commit, the receipt will be imported before
the state. This causes an annoyance where the pivot block is committed
as fast block head even when state downloads fail. Stay tuned for more
updates in this area ;)
* eth/downloader: remove cancelTimeout channel
* eth/downloader: retry state requests on timeout
* eth/downloader: improve comment
* eth/downloader: mark peers idle when state sync is done
* eth/downloader: move pivot block splitting to processContent
This change also ensures that pivot block receipts aren't imported
before the pivot block itself.
* eth/downloader: limit state node retries
* eth/downloader: improve state node error handling and retry check
* eth/downloader: remove maxStateNodeRetries
It fails the sync too much.
* eth/downloader: remove last use of cancelCh in statesync.go
Fixes TestDeliverHeadersHang*Fast and (hopefully)
the weird cancellation behaviour at the end of fast sync.
* eth/downloader: fix leak in runStateSync
* eth/downloader: don't run processFullSyncContent in LightSync mode
* eth/downloader: improve comments
* eth/downloader: fix vet, megacheck
* eth/downloader: remove unrequested tasks anyway
* eth/downloader, trie: various polishes around duplicate items
This commit explicitly tracks duplicate and unexpected state
deliveries done against a trie Sync structure, also adding
there to import info logs.
The commit moves the db batch used to commit trie changes one
level deeper so its flushed after every node insertion. This
is needed to avoid a lot of duplicate retrievals caused by
inconsistencies between Sync internals and database. A better
approach is to track not-yet-written states in trie.Sync and
flush on commit, but I'm focusing on correctness first now.
The commit fixes a regression around pivot block fail count.
The counter previously was reset to 1 if and only if a sync
cycle progressed (inserted at least 1 entry to the database).
The current code reset it already if a node was delivered,
which is not strong enough, because unless it ends up written
to disk, an attacker can just loop and attack ad infinitum.
The commit also fixes a regression around state deliveries
and timeouts. The old downloader tracked if a delivery is
stale (none of the deliveries were requested), in which
case it didn't mark the node idle and did not send further
requests, since it signals a past timeout. The current code
did mark it idle even on stale deliveries, which eventually
caused two requests to be in flight at the same time, making
the deliveries always stale and mass duplicating retrievals
between multiple peers.
* eth/downloader: fix state request leak
This commit fixes the hang seen sometimes while doing the state
sync. The cause of the hang was a rare combination of events:
request state data from peer, peer drops and reconnects almost
immediately. This caused a new download task to be assigned to
the peer, overwriting the old one still waiting for a timeout,
which in turn leaked the requests out, never to be retried.
The fix is to ensure that a task assignment moves any pending
one back into the retry queue.
The commit also fixes a regression with peer dropping due to
stalls. The current code considered a peer stalling if they
timed out delivering 1 item. However, the downloader never
requests only one, the minimum is 2 (attempt to fine tune
estimated latency/bandwidth). The fix is simply to drop if
a timeout is detected at 2 items.
Apart from the above bugfixes, the commit contains some code
polishes I made while debugging the hang.
* core, eth, trie: support batched trie sync db writes
* trie: rename SyncMemCache to syncMemBatch
2017-06-22 15:26:03 +03:00
|
|
|
}
|
|
|
|
if d.chainInsertHook != nil {
|
|
|
|
d.chainInsertHook(results)
|
|
|
|
}
|
2020-09-08 11:13:16 +03:00
|
|
|
// If we haven't downloaded the pivot block yet, check pivot staleness
|
|
|
|
// notifications from the header downloader
|
|
|
|
d.pivotLock.RLock()
|
|
|
|
pivot := d.pivotHeader
|
|
|
|
d.pivotLock.RUnlock()
|
|
|
|
|
|
|
|
if oldPivot == nil {
|
|
|
|
if pivot.Root != sync.root {
|
|
|
|
sync.Cancel()
|
|
|
|
sync = d.syncState(pivot.Root)
|
|
|
|
|
|
|
|
go closeOnErr(sync)
|
|
|
|
}
|
|
|
|
} else {
|
2018-02-05 19:40:32 +03:00
|
|
|
results = append(append([]*fetchResult{oldPivot}, oldTail...), results...)
|
|
|
|
}
|
2021-11-26 14:26:03 +03:00
|
|
|
// Split around the pivot block and process the two sides via snap/full sync
|
2018-02-05 19:40:32 +03:00
|
|
|
if atomic.LoadInt32(&d.committed) == 0 {
|
2020-09-08 11:13:16 +03:00
|
|
|
latest := results[len(results)-1].Header
|
|
|
|
// If the height is above the pivot block by 2 sets, it means the pivot
|
|
|
|
// become stale in the network and it was garbage collected, move to a
|
|
|
|
// new pivot.
|
|
|
|
//
|
|
|
|
// Note, we have `reorgProtHeaderDelay` number of blocks withheld, Those
|
|
|
|
// need to be taken into account, otherwise we're detecting the pivot move
|
|
|
|
// late and will drop peers due to unavailable state!!!
|
|
|
|
if height := latest.Number.Uint64(); height >= pivot.Number.Uint64()+2*uint64(fsMinFullBlocks)-uint64(reorgProtHeaderDelay) {
|
|
|
|
log.Warn("Pivot became stale, moving", "old", pivot.Number.Uint64(), "new", height-uint64(fsMinFullBlocks)+uint64(reorgProtHeaderDelay))
|
|
|
|
pivot = results[len(results)-1-fsMinFullBlocks+reorgProtHeaderDelay].Header // must exist as lower old pivot is uncommitted
|
|
|
|
|
|
|
|
d.pivotLock.Lock()
|
|
|
|
d.pivotHeader = pivot
|
|
|
|
d.pivotLock.Unlock()
|
2020-08-20 13:01:24 +03:00
|
|
|
|
|
|
|
// Write out the pivot into the database so a rollback beyond it will
|
2021-11-26 14:26:03 +03:00
|
|
|
// reenable snap sync
|
2020-09-08 11:13:16 +03:00
|
|
|
rawdb.WriteLastPivotNumber(d.stateDB, pivot.Number.Uint64())
|
2018-02-05 19:40:32 +03:00
|
|
|
}
|
|
|
|
}
|
2020-09-08 11:13:16 +03:00
|
|
|
P, beforeP, afterP := splitAroundPivot(pivot.Number.Uint64(), results)
|
2021-11-26 14:26:03 +03:00
|
|
|
if err := d.commitSnapSyncData(beforeP, sync); err != nil {
|
eth/downloader: separate state sync from queue (#14460)
* eth/downloader: separate state sync from queue
Scheduling of state node downloads hogged the downloader queue lock when
new requests were scheduled. This caused timeouts for other requests.
With this change, state sync is fully independent of all other downloads
and doesn't involve the queue at all.
State sync is started and checked on in processContent. This is slightly
awkward because processContent doesn't have a select loop. Instead, the
queue is closed by an auxiliary goroutine when state sync fails. We
tried several alternatives to this but settled on the current approach
because it's the least amount of change overall.
Handling of the pivot block has changed slightly: the queue previously
prevented import of pivot block receipts before the state of the pivot
block was available. In this commit, the receipt will be imported before
the state. This causes an annoyance where the pivot block is committed
as fast block head even when state downloads fail. Stay tuned for more
updates in this area ;)
* eth/downloader: remove cancelTimeout channel
* eth/downloader: retry state requests on timeout
* eth/downloader: improve comment
* eth/downloader: mark peers idle when state sync is done
* eth/downloader: move pivot block splitting to processContent
This change also ensures that pivot block receipts aren't imported
before the pivot block itself.
* eth/downloader: limit state node retries
* eth/downloader: improve state node error handling and retry check
* eth/downloader: remove maxStateNodeRetries
It fails the sync too much.
* eth/downloader: remove last use of cancelCh in statesync.go
Fixes TestDeliverHeadersHang*Fast and (hopefully)
the weird cancellation behaviour at the end of fast sync.
* eth/downloader: fix leak in runStateSync
* eth/downloader: don't run processFullSyncContent in LightSync mode
* eth/downloader: improve comments
* eth/downloader: fix vet, megacheck
* eth/downloader: remove unrequested tasks anyway
* eth/downloader, trie: various polishes around duplicate items
This commit explicitly tracks duplicate and unexpected state
deliveries done against a trie Sync structure, also adding
there to import info logs.
The commit moves the db batch used to commit trie changes one
level deeper so its flushed after every node insertion. This
is needed to avoid a lot of duplicate retrievals caused by
inconsistencies between Sync internals and database. A better
approach is to track not-yet-written states in trie.Sync and
flush on commit, but I'm focusing on correctness first now.
The commit fixes a regression around pivot block fail count.
The counter previously was reset to 1 if and only if a sync
cycle progressed (inserted at least 1 entry to the database).
The current code reset it already if a node was delivered,
which is not strong enough, because unless it ends up written
to disk, an attacker can just loop and attack ad infinitum.
The commit also fixes a regression around state deliveries
and timeouts. The old downloader tracked if a delivery is
stale (none of the deliveries were requested), in which
case it didn't mark the node idle and did not send further
requests, since it signals a past timeout. The current code
did mark it idle even on stale deliveries, which eventually
caused two requests to be in flight at the same time, making
the deliveries always stale and mass duplicating retrievals
between multiple peers.
* eth/downloader: fix state request leak
This commit fixes the hang seen sometimes while doing the state
sync. The cause of the hang was a rare combination of events:
request state data from peer, peer drops and reconnects almost
immediately. This caused a new download task to be assigned to
the peer, overwriting the old one still waiting for a timeout,
which in turn leaked the requests out, never to be retried.
The fix is to ensure that a task assignment moves any pending
one back into the retry queue.
The commit also fixes a regression with peer dropping due to
stalls. The current code considered a peer stalling if they
timed out delivering 1 item. However, the downloader never
requests only one, the minimum is 2 (attempt to fine tune
estimated latency/bandwidth). The fix is simply to drop if
a timeout is detected at 2 items.
Apart from the above bugfixes, the commit contains some code
polishes I made while debugging the hang.
* core, eth, trie: support batched trie sync db writes
* trie: rename SyncMemCache to syncMemBatch
2017-06-22 15:26:03 +03:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
if P != nil {
|
2018-02-05 19:40:32 +03:00
|
|
|
// If new pivot block found, cancel old state retrieval and restart
|
|
|
|
if oldPivot != P {
|
2019-10-25 14:17:32 +03:00
|
|
|
sync.Cancel()
|
|
|
|
sync = d.syncState(P.Header.Root)
|
2020-08-26 13:05:06 +03:00
|
|
|
|
2019-10-25 14:17:32 +03:00
|
|
|
go closeOnErr(sync)
|
2018-02-05 19:40:32 +03:00
|
|
|
oldPivot = P
|
|
|
|
}
|
|
|
|
// Wait for completion, occasionally checking for pivot staleness
|
|
|
|
select {
|
2019-10-25 14:17:32 +03:00
|
|
|
case <-sync.done:
|
|
|
|
if sync.err != nil {
|
|
|
|
return sync.err
|
2018-02-05 19:40:32 +03:00
|
|
|
}
|
|
|
|
if err := d.commitPivotBlock(P); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
oldPivot = nil
|
|
|
|
|
|
|
|
case <-time.After(time.Second):
|
|
|
|
oldTail = afterP
|
|
|
|
continue
|
2015-06-12 13:35:29 +03:00
|
|
|
}
|
eth/downloader: separate state sync from queue (#14460)
* eth/downloader: separate state sync from queue
Scheduling of state node downloads hogged the downloader queue lock when
new requests were scheduled. This caused timeouts for other requests.
With this change, state sync is fully independent of all other downloads
and doesn't involve the queue at all.
State sync is started and checked on in processContent. This is slightly
awkward because processContent doesn't have a select loop. Instead, the
queue is closed by an auxiliary goroutine when state sync fails. We
tried several alternatives to this but settled on the current approach
because it's the least amount of change overall.
Handling of the pivot block has changed slightly: the queue previously
prevented import of pivot block receipts before the state of the pivot
block was available. In this commit, the receipt will be imported before
the state. This causes an annoyance where the pivot block is committed
as fast block head even when state downloads fail. Stay tuned for more
updates in this area ;)
* eth/downloader: remove cancelTimeout channel
* eth/downloader: retry state requests on timeout
* eth/downloader: improve comment
* eth/downloader: mark peers idle when state sync is done
* eth/downloader: move pivot block splitting to processContent
This change also ensures that pivot block receipts aren't imported
before the pivot block itself.
* eth/downloader: limit state node retries
* eth/downloader: improve state node error handling and retry check
* eth/downloader: remove maxStateNodeRetries
It fails the sync too much.
* eth/downloader: remove last use of cancelCh in statesync.go
Fixes TestDeliverHeadersHang*Fast and (hopefully)
the weird cancellation behaviour at the end of fast sync.
* eth/downloader: fix leak in runStateSync
* eth/downloader: don't run processFullSyncContent in LightSync mode
* eth/downloader: improve comments
* eth/downloader: fix vet, megacheck
* eth/downloader: remove unrequested tasks anyway
* eth/downloader, trie: various polishes around duplicate items
This commit explicitly tracks duplicate and unexpected state
deliveries done against a trie Sync structure, also adding
there to import info logs.
The commit moves the db batch used to commit trie changes one
level deeper so its flushed after every node insertion. This
is needed to avoid a lot of duplicate retrievals caused by
inconsistencies between Sync internals and database. A better
approach is to track not-yet-written states in trie.Sync and
flush on commit, but I'm focusing on correctness first now.
The commit fixes a regression around pivot block fail count.
The counter previously was reset to 1 if and only if a sync
cycle progressed (inserted at least 1 entry to the database).
The current code reset it already if a node was delivered,
which is not strong enough, because unless it ends up written
to disk, an attacker can just loop and attack ad infinitum.
The commit also fixes a regression around state deliveries
and timeouts. The old downloader tracked if a delivery is
stale (none of the deliveries were requested), in which
case it didn't mark the node idle and did not send further
requests, since it signals a past timeout. The current code
did mark it idle even on stale deliveries, which eventually
caused two requests to be in flight at the same time, making
the deliveries always stale and mass duplicating retrievals
between multiple peers.
* eth/downloader: fix state request leak
This commit fixes the hang seen sometimes while doing the state
sync. The cause of the hang was a rare combination of events:
request state data from peer, peer drops and reconnects almost
immediately. This caused a new download task to be assigned to
the peer, overwriting the old one still waiting for a timeout,
which in turn leaked the requests out, never to be retried.
The fix is to ensure that a task assignment moves any pending
one back into the retry queue.
The commit also fixes a regression with peer dropping due to
stalls. The current code considered a peer stalling if they
timed out delivering 1 item. However, the downloader never
requests only one, the minimum is 2 (attempt to fine tune
estimated latency/bandwidth). The fix is simply to drop if
a timeout is detected at 2 items.
Apart from the above bugfixes, the commit contains some code
polishes I made while debugging the hang.
* core, eth, trie: support batched trie sync db writes
* trie: rename SyncMemCache to syncMemBatch
2017-06-22 15:26:03 +03:00
|
|
|
}
|
2018-02-05 19:40:32 +03:00
|
|
|
// Fast sync done, pivot commit done, full import
|
eth/downloader: separate state sync from queue (#14460)
* eth/downloader: separate state sync from queue
Scheduling of state node downloads hogged the downloader queue lock when
new requests were scheduled. This caused timeouts for other requests.
With this change, state sync is fully independent of all other downloads
and doesn't involve the queue at all.
State sync is started and checked on in processContent. This is slightly
awkward because processContent doesn't have a select loop. Instead, the
queue is closed by an auxiliary goroutine when state sync fails. We
tried several alternatives to this but settled on the current approach
because it's the least amount of change overall.
Handling of the pivot block has changed slightly: the queue previously
prevented import of pivot block receipts before the state of the pivot
block was available. In this commit, the receipt will be imported before
the state. This causes an annoyance where the pivot block is committed
as fast block head even when state downloads fail. Stay tuned for more
updates in this area ;)
* eth/downloader: remove cancelTimeout channel
* eth/downloader: retry state requests on timeout
* eth/downloader: improve comment
* eth/downloader: mark peers idle when state sync is done
* eth/downloader: move pivot block splitting to processContent
This change also ensures that pivot block receipts aren't imported
before the pivot block itself.
* eth/downloader: limit state node retries
* eth/downloader: improve state node error handling and retry check
* eth/downloader: remove maxStateNodeRetries
It fails the sync too much.
* eth/downloader: remove last use of cancelCh in statesync.go
Fixes TestDeliverHeadersHang*Fast and (hopefully)
the weird cancellation behaviour at the end of fast sync.
* eth/downloader: fix leak in runStateSync
* eth/downloader: don't run processFullSyncContent in LightSync mode
* eth/downloader: improve comments
* eth/downloader: fix vet, megacheck
* eth/downloader: remove unrequested tasks anyway
* eth/downloader, trie: various polishes around duplicate items
This commit explicitly tracks duplicate and unexpected state
deliveries done against a trie Sync structure, also adding
there to import info logs.
The commit moves the db batch used to commit trie changes one
level deeper so its flushed after every node insertion. This
is needed to avoid a lot of duplicate retrievals caused by
inconsistencies between Sync internals and database. A better
approach is to track not-yet-written states in trie.Sync and
flush on commit, but I'm focusing on correctness first now.
The commit fixes a regression around pivot block fail count.
The counter previously was reset to 1 if and only if a sync
cycle progressed (inserted at least 1 entry to the database).
The current code reset it already if a node was delivered,
which is not strong enough, because unless it ends up written
to disk, an attacker can just loop and attack ad infinitum.
The commit also fixes a regression around state deliveries
and timeouts. The old downloader tracked if a delivery is
stale (none of the deliveries were requested), in which
case it didn't mark the node idle and did not send further
requests, since it signals a past timeout. The current code
did mark it idle even on stale deliveries, which eventually
caused two requests to be in flight at the same time, making
the deliveries always stale and mass duplicating retrievals
between multiple peers.
* eth/downloader: fix state request leak
This commit fixes the hang seen sometimes while doing the state
sync. The cause of the hang was a rare combination of events:
request state data from peer, peer drops and reconnects almost
immediately. This caused a new download task to be assigned to
the peer, overwriting the old one still waiting for a timeout,
which in turn leaked the requests out, never to be retried.
The fix is to ensure that a task assignment moves any pending
one back into the retry queue.
The commit also fixes a regression with peer dropping due to
stalls. The current code considered a peer stalling if they
timed out delivering 1 item. However, the downloader never
requests only one, the minimum is 2 (attempt to fine tune
estimated latency/bandwidth). The fix is simply to drop if
a timeout is detected at 2 items.
Apart from the above bugfixes, the commit contains some code
polishes I made while debugging the hang.
* core, eth, trie: support batched trie sync db writes
* trie: rename SyncMemCache to syncMemBatch
2017-06-22 15:26:03 +03:00
|
|
|
if err := d.importBlockResults(afterP); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func splitAroundPivot(pivot uint64, results []*fetchResult) (p *fetchResult, before, after []*fetchResult) {
|
2020-07-24 10:46:26 +03:00
|
|
|
if len(results) == 0 {
|
|
|
|
return nil, nil, nil
|
|
|
|
}
|
|
|
|
if lastNum := results[len(results)-1].Header.Number.Uint64(); lastNum < pivot {
|
|
|
|
// the pivot is somewhere in the future
|
|
|
|
return nil, results, nil
|
|
|
|
}
|
|
|
|
// This can also be optimized, but only happens very seldom
|
eth/downloader: separate state sync from queue (#14460)
* eth/downloader: separate state sync from queue
Scheduling of state node downloads hogged the downloader queue lock when
new requests were scheduled. This caused timeouts for other requests.
With this change, state sync is fully independent of all other downloads
and doesn't involve the queue at all.
State sync is started and checked on in processContent. This is slightly
awkward because processContent doesn't have a select loop. Instead, the
queue is closed by an auxiliary goroutine when state sync fails. We
tried several alternatives to this but settled on the current approach
because it's the least amount of change overall.
Handling of the pivot block has changed slightly: the queue previously
prevented import of pivot block receipts before the state of the pivot
block was available. In this commit, the receipt will be imported before
the state. This causes an annoyance where the pivot block is committed
as fast block head even when state downloads fail. Stay tuned for more
updates in this area ;)
* eth/downloader: remove cancelTimeout channel
* eth/downloader: retry state requests on timeout
* eth/downloader: improve comment
* eth/downloader: mark peers idle when state sync is done
* eth/downloader: move pivot block splitting to processContent
This change also ensures that pivot block receipts aren't imported
before the pivot block itself.
* eth/downloader: limit state node retries
* eth/downloader: improve state node error handling and retry check
* eth/downloader: remove maxStateNodeRetries
It fails the sync too much.
* eth/downloader: remove last use of cancelCh in statesync.go
Fixes TestDeliverHeadersHang*Fast and (hopefully)
the weird cancellation behaviour at the end of fast sync.
* eth/downloader: fix leak in runStateSync
* eth/downloader: don't run processFullSyncContent in LightSync mode
* eth/downloader: improve comments
* eth/downloader: fix vet, megacheck
* eth/downloader: remove unrequested tasks anyway
* eth/downloader, trie: various polishes around duplicate items
This commit explicitly tracks duplicate and unexpected state
delieveries done against a trie Sync structure, also adding
there to import info logs.
The commit moves the db batch used to commit trie changes one
level deeper so its flushed after every node insertion. This
is needed to avoid a lot of duplicate retrievals caused by
inconsistencies between Sync internals and database. A better
approach is to track not-yet-written states in trie.Sync and
flush on commit, but I'm focuing on correctness first now.
The commit fixes a regression around pivot block fail count.
The counter previously was reset to 1 if and only if a sync
cycle progressed (inserted at least 1 entry to the database).
The current code reset it already if a node was delivered,
which is not stong enough, because unless it ends up written
to disk, an attacker can just loop and attack ad infinitum.
The commit also fixes a regression around state deliveries
and timeouts. The old downloader tracked if a delivery is
stale (none of the deliveries were requestedt), in which
case it didn't mark the node idle and did not send further
requests, since it signals a past timeout. The current code
did mark it idle even on stale deliveries, which eventually
caused two requests to be in flight at the same time, making
the deliveries always stale and mass duplicating retrievals
between multiple peers.
* eth/downloader: fix state request leak
This commit fixes the hang seen sometimes while doing the state
sync. The cause of the hang was a rare combination of events:
request state data from peer, peer drops and reconnects almost
immediately. This caused a new download task to be assigned to
the peer, overwriting the old one still waiting for a timeout,
which in turned leaked the requests out, never to be retried.
The fix is to ensure that a task assignment moves any pending
one back into the retry queue.
The commit also fixes a regression with peer dropping due to
stalls. The current code considered a peer stalling if they
timed out delivering 1 item. However, the downloader never
requests only one, the minimum is 2 (attempt to fine tune
estimated latency/bandwidth). The fix is simply to drop if
a timeout is detected at 2 items.
Apart from the above bugfixes, the commit contains some code
polishes I made while debugging the hang.
* core, eth, trie: support batched trie sync db writes
* trie: rename SyncMemCache to syncMemBatch
2017-06-22 15:26:03 +03:00
|
|
|
for _, result := range results {
|
|
|
|
num := result.Header.Number.Uint64()
|
|
|
|
switch {
|
|
|
|
case num < pivot:
|
|
|
|
before = append(before, result)
|
|
|
|
case num == pivot:
|
|
|
|
p = result
|
|
|
|
default:
|
|
|
|
after = append(after, result)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return p, before, after
|
|
|
|
}
|
|
|
|
|
2021-11-26 14:26:03 +03:00
|
|
|
func (d *Downloader) commitSnapSyncData(results []*fetchResult, stateSync *stateSync) error {
|
2018-02-05 19:40:32 +03:00
|
|
|
// Check for any early termination requests
|
|
|
|
if len(results) == 0 {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
select {
|
|
|
|
case <-d.quitCh:
|
|
|
|
return errCancelContentProcessing
|
|
|
|
case <-stateSync.done:
|
|
|
|
if err := stateSync.Wait(); err != nil {
|
|
|
|
return err
|
eth/downloader: separate state sync from queue (#14460)
* eth/downloader: separate state sync from queue
Scheduling of state node downloads hogged the downloader queue lock when
new requests were scheduled. This caused timeouts for other requests.
With this change, state sync is fully independent of all other downloads
and doesn't involve the queue at all.
State sync is started and checked on in processContent. This is slightly
awkward because processContent doesn't have a select loop. Instead, the
queue is closed by an auxiliary goroutine when state sync fails. We
tried several alternatives to this but settled on the current approach
because it's the least amount of change overall.
Handling of the pivot block has changed slightly: the queue previously
prevented import of pivot block receipts before the state of the pivot
block was available. In this commit, the receipt will be imported before
the state. This causes an annoyance where the pivot block is committed
as fast block head even when state downloads fail. Stay tuned for more
updates in this area ;)
* eth/downloader: remove cancelTimeout channel
* eth/downloader: retry state requests on timeout
* eth/downloader: improve comment
* eth/downloader: mark peers idle when state sync is done
* eth/downloader: move pivot block splitting to processContent
This change also ensures that pivot block receipts aren't imported
before the pivot block itself.
* eth/downloader: limit state node retries
* eth/downloader: improve state node error handling and retry check
* eth/downloader: remove maxStateNodeRetries
It fails the sync too much.
* eth/downloader: remove last use of cancelCh in statesync.go
Fixes TestDeliverHeadersHang*Fast and (hopefully)
the weird cancellation behaviour at the end of fast sync.
* eth/downloader: fix leak in runStateSync
* eth/downloader: don't run processFullSyncContent in LightSync mode
* eth/downloader: improve comments
* eth/downloader: fix vet, megacheck
* eth/downloader: remove unrequested tasks anyway
* eth/downloader, trie: various polishes around duplicate items
This commit explicitly tracks duplicate and unexpected state
delieveries done against a trie Sync structure, also adding
there to import info logs.
The commit moves the db batch used to commit trie changes one
level deeper so its flushed after every node insertion. This
is needed to avoid a lot of duplicate retrievals caused by
inconsistencies between Sync internals and database. A better
approach is to track not-yet-written states in trie.Sync and
flush on commit, but I'm focuing on correctness first now.
The commit fixes a regression around pivot block fail count.
The counter previously was reset to 1 if and only if a sync
cycle progressed (inserted at least 1 entry to the database).
The current code reset it already if a node was delivered,
which is not stong enough, because unless it ends up written
to disk, an attacker can just loop and attack ad infinitum.
The commit also fixes a regression around state deliveries
and timeouts. The old downloader tracked if a delivery is
stale (none of the deliveries were requestedt), in which
case it didn't mark the node idle and did not send further
requests, since it signals a past timeout. The current code
did mark it idle even on stale deliveries, which eventually
caused two requests to be in flight at the same time, making
the deliveries always stale and mass duplicating retrievals
between multiple peers.
* eth/downloader: fix state request leak
This commit fixes the hang seen sometimes while doing the state
sync. The cause of the hang was a rare combination of events:
request state data from peer, peer drops and reconnects almost
immediately. This caused a new download task to be assigned to
the peer, overwriting the old one still waiting for a timeout,
which in turned leaked the requests out, never to be retried.
The fix is to ensure that a task assignment moves any pending
one back into the retry queue.
The commit also fixes a regression with peer dropping due to
stalls. The current code considered a peer stalling if they
timed out delivering 1 item. However, the downloader never
requests only one, the minimum is 2 (attempt to fine tune
estimated latency/bandwidth). The fix is simply to drop if
a timeout is detected at 2 items.
Apart from the above bugfixes, the commit contains some code
polishes I made while debugging the hang.
* core, eth, trie: support batched trie sync db writes
* trie: rename SyncMemCache to syncMemBatch
2017-06-22 15:26:03 +03:00
|
|
|
}
|
2018-02-05 19:40:32 +03:00
|
|
|
default:
|
|
|
|
}
|
|
|
|
// Retrieve the a batch of results to import
|
|
|
|
first, last := results[0].Header, results[len(results)-1].Header
|
2021-11-26 14:26:03 +03:00
|
|
|
log.Debug("Inserting snap-sync blocks", "items", len(results),
|
2018-02-05 19:40:32 +03:00
|
|
|
"firstnum", first.Number, "firsthash", first.Hash(),
|
|
|
|
"lastnumn", last.Number, "lasthash", last.Hash(),
|
|
|
|
)
|
|
|
|
blocks := make([]*types.Block, len(results))
|
|
|
|
receipts := make([]types.Receipts, len(results))
|
|
|
|
for i, result := range results {
|
|
|
|
blocks[i] = types.NewBlockWithHeader(result.Header).WithBody(result.Transactions, result.Uncles)
|
|
|
|
receipts[i] = result.Receipts
|
|
|
|
}
|
all: integrate the freezer with fast sync
* all: freezer style syncing
core, eth, les, light: clean up freezer relative APIs
core, eth, les, trie, ethdb, light: clean a bit
core, eth, les, light: add unit tests
core, light: rewrite setHead function
core, eth: fix downloader unit tests
core: add receipt chain insertion test
core: use constant instead of hardcoding table name
core: fix rollback
core: fix setHead
core/rawdb: remove canonical block first and then iterate side chain
core/rawdb, ethdb: add hasAncient interface
eth/downloader: calculate ancient limit via cht first
core, eth, ethdb: lots of fixes
* eth/downloader: print ancient disable log only for fast sync
2019-04-25 17:59:48 +03:00
|
|
|
if index, err := d.blockchain.InsertReceiptChain(blocks, receipts, d.ancientLimit); err != nil {
|
2018-02-05 19:40:32 +03:00
|
|
|
log.Debug("Downloaded item processing failed", "number", results[index].Header.Number, "hash", results[index].Header.Hash(), "err", err)
|
2020-05-29 12:12:43 +03:00
|
|
|
return fmt.Errorf("%w: %v", errInvalidChain, err)
|
eth/downloader: separate state sync from queue (#14460)
* eth/downloader: separate state sync from queue
Scheduling of state node downloads hogged the downloader queue lock when
new requests were scheduled. This caused timeouts for other requests.
With this change, state sync is fully independent of all other downloads
and doesn't involve the queue at all.
State sync is started and checked on in processContent. This is slightly
awkward because processContent doesn't have a select loop. Instead, the
queue is closed by an auxiliary goroutine when state sync fails. We
tried several alternatives to this but settled on the current approach
because it's the least amount of change overall.
Handling of the pivot block has changed slightly: the queue previously
prevented import of pivot block receipts before the state of the pivot
block was available. In this commit, the receipt will be imported before
the state. This causes an annoyance where the pivot block is committed
as fast block head even when state downloads fail. Stay tuned for more
updates in this area ;)
* eth/downloader: remove cancelTimeout channel
* eth/downloader: retry state requests on timeout
* eth/downloader: improve comment
* eth/downloader: mark peers idle when state sync is done
* eth/downloader: move pivot block splitting to processContent
This change also ensures that pivot block receipts aren't imported
before the pivot block itself.
* eth/downloader: limit state node retries
* eth/downloader: improve state node error handling and retry check
* eth/downloader: remove maxStateNodeRetries
It fails the sync too much.
* eth/downloader: remove last use of cancelCh in statesync.go
Fixes TestDeliverHeadersHang*Fast and (hopefully)
the weird cancellation behaviour at the end of fast sync.
* eth/downloader: fix leak in runStateSync
* eth/downloader: don't run processFullSyncContent in LightSync mode
* eth/downloader: improve comments
* eth/downloader: fix vet, megacheck
* eth/downloader: remove unrequested tasks anyway
* eth/downloader, trie: various polishes around duplicate items
This commit explicitly tracks duplicate and unexpected state
delieveries done against a trie Sync structure, also adding
there to import info logs.
The commit moves the db batch used to commit trie changes one
level deeper so its flushed after every node insertion. This
is needed to avoid a lot of duplicate retrievals caused by
inconsistencies between Sync internals and database. A better
approach is to track not-yet-written states in trie.Sync and
flush on commit, but I'm focuing on correctness first now.
The commit fixes a regression around pivot block fail count.
The counter previously was reset to 1 if and only if a sync
cycle progressed (inserted at least 1 entry to the database).
The current code reset it already if a node was delivered,
which is not stong enough, because unless it ends up written
to disk, an attacker can just loop and attack ad infinitum.
The commit also fixes a regression around state deliveries
and timeouts. The old downloader tracked if a delivery is
stale (none of the deliveries were requestedt), in which
case it didn't mark the node idle and did not send further
requests, since it signals a past timeout. The current code
did mark it idle even on stale deliveries, which eventually
caused two requests to be in flight at the same time, making
the deliveries always stale and mass duplicating retrievals
between multiple peers.
* eth/downloader: fix state request leak
This commit fixes the hang seen sometimes while doing the state
sync. The cause of the hang was a rare combination of events:
request state data from peer, peer drops and reconnects almost
immediately. This caused a new download task to be assigned to
the peer, overwriting the old one still waiting for a timeout,
which in turned leaked the requests out, never to be retried.
The fix is to ensure that a task assignment moves any pending
one back into the retry queue.
The commit also fixes a regression with peer dropping due to
stalls. The current code considered a peer stalling if they
timed out delivering 1 item. However, the downloader never
requests only one, the minimum is 2 (attempt to fine tune
estimated latency/bandwidth). The fix is simply to drop if
a timeout is detected at 2 items.
Apart from the above bugfixes, the commit contains some code
polishes I made while debugging the hang.
* core, eth, trie: support batched trie sync db writes
* trie: rename SyncMemCache to syncMemBatch
2017-06-22 15:26:03 +03:00
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (d *Downloader) commitPivotBlock(result *fetchResult) error {
|
2018-02-05 19:40:32 +03:00
|
|
|
block := types.NewBlockWithHeader(result.Header).WithBody(result.Transactions, result.Uncles)
|
2021-11-26 14:26:03 +03:00
|
|
|
log.Debug("Committing snap sync pivot as new head", "number", block.Number(), "hash", block.Hash())
|
2019-05-13 15:28:01 +03:00
|
|
|
|
|
|
|
// Commit the pivot block as the new head, will require full sync from here on
|
all: integrate the freezer with fast sync
* all: freezer style syncing
core, eth, les, light: clean up freezer relative APIs
core, eth, les, trie, ethdb, light: clean a bit
core, eth, les, light: add unit tests
core, light: rewrite setHead function
core, eth: fix downloader unit tests
core: add receipt chain insertion test
core: use constant instead of hardcoding table name
core: fix rollback
core: fix setHead
core/rawdb: remove canonical block first and then iterate side chain
core/rawdb, ethdb: add hasAncient interface
eth/downloader: calculate ancient limit via cht first
core, eth, ethdb: lots of fixes
* eth/downloader: print ancient disable log only for fast sync
2019-04-25 17:59:48 +03:00
|
|
|
if _, err := d.blockchain.InsertReceiptChain([]*types.Block{block}, []types.Receipts{result.Receipts}, d.ancientLimit); err != nil {
|
eth/downloader: separate state sync from queue (#14460)
* eth/downloader: separate state sync from queue
Scheduling of state node downloads hogged the downloader queue lock when
new requests were scheduled. This caused timeouts for other requests.
With this change, state sync is fully independent of all other downloads
and doesn't involve the queue at all.
State sync is started and checked on in processContent. This is slightly
awkward because processContent doesn't have a select loop. Instead, the
queue is closed by an auxiliary goroutine when state sync fails. We
tried several alternatives to this but settled on the current approach
because it's the least amount of change overall.
Handling of the pivot block has changed slightly: the queue previously
prevented import of pivot block receipts before the state of the pivot
block was available. In this commit, the receipt will be imported before
the state. This causes an annoyance where the pivot block is committed
as fast block head even when state downloads fail. Stay tuned for more
updates in this area ;)
* eth/downloader: remove cancelTimeout channel
* eth/downloader: retry state requests on timeout
* eth/downloader: improve comment
* eth/downloader: mark peers idle when state sync is done
* eth/downloader: move pivot block splitting to processContent
This change also ensures that pivot block receipts aren't imported
before the pivot block itself.
* eth/downloader: limit state node retries
* eth/downloader: improve state node error handling and retry check
* eth/downloader: remove maxStateNodeRetries
It fails the sync too much.
* eth/downloader: remove last use of cancelCh in statesync.go
Fixes TestDeliverHeadersHang*Fast and (hopefully)
the weird cancellation behaviour at the end of fast sync.
* eth/downloader: fix leak in runStateSync
* eth/downloader: don't run processFullSyncContent in LightSync mode
* eth/downloader: improve comments
* eth/downloader: fix vet, megacheck
* eth/downloader: remove unrequested tasks anyway
* eth/downloader, trie: various polishes around duplicate items
This commit explicitly tracks duplicate and unexpected state
delieveries done against a trie Sync structure, also adding
there to import info logs.
The commit moves the db batch used to commit trie changes one
level deeper so its flushed after every node insertion. This
is needed to avoid a lot of duplicate retrievals caused by
inconsistencies between Sync internals and database. A better
approach is to track not-yet-written states in trie.Sync and
flush on commit, but I'm focuing on correctness first now.
The commit fixes a regression around pivot block fail count.
The counter previously was reset to 1 if and only if a sync
cycle progressed (inserted at least 1 entry to the database).
The current code reset it already if a node was delivered,
which is not stong enough, because unless it ends up written
to disk, an attacker can just loop and attack ad infinitum.
The commit also fixes a regression around state deliveries
and timeouts. The old downloader tracked if a delivery is
stale (none of the deliveries were requestedt), in which
case it didn't mark the node idle and did not send further
requests, since it signals a past timeout. The current code
did mark it idle even on stale deliveries, which eventually
caused two requests to be in flight at the same time, making
the deliveries always stale and mass duplicating retrievals
between multiple peers.
* eth/downloader: fix state request leak
This commit fixes the hang seen sometimes while doing the state
sync. The cause of the hang was a rare combination of events:
request state data from peer, peer drops and reconnects almost
immediately. This caused a new download task to be assigned to
the peer, overwriting the old one still waiting for a timeout,
which in turned leaked the requests out, never to be retried.
The fix is to ensure that a task assignment moves any pending
one back into the retry queue.
The commit also fixes a regression with peer dropping due to
stalls. The current code considered a peer stalling if they
timed out delivering 1 item. However, the downloader never
requests only one, the minimum is 2 (attempt to fine tune
estimated latency/bandwidth). The fix is simply to drop if
a timeout is detected at 2 items.
Apart from the above bugfixes, the commit contains some code
polishes I made while debugging the hang.
* core, eth, trie: support batched trie sync db writes
* trie: rename SyncMemCache to syncMemBatch
2017-06-22 15:26:03 +03:00
|
|
|
return err
|
|
|
|
}
|
2021-11-26 14:26:03 +03:00
|
|
|
if err := d.blockchain.SnapSyncCommitHead(block.Hash()); err != nil {
|
eth/downloader: separate state sync from queue (#14460)
* eth/downloader: separate state sync from queue
Scheduling of state node downloads hogged the downloader queue lock when
new requests were scheduled. This caused timeouts for other requests.
With this change, state sync is fully independent of all other downloads
and doesn't involve the queue at all.
State sync is started and checked on in processContent. This is slightly
awkward because processContent doesn't have a select loop. Instead, the
queue is closed by an auxiliary goroutine when state sync fails. We
tried several alternatives to this but settled on the current approach
because it's the least amount of change overall.
Handling of the pivot block has changed slightly: the queue previously
prevented import of pivot block receipts before the state of the pivot
block was available. In this commit, the receipt will be imported before
the state. This causes an annoyance where the pivot block is committed
as fast block head even when state downloads fail. Stay tuned for more
updates in this area ;)
* eth/downloader: remove cancelTimeout channel
* eth/downloader: retry state requests on timeout
* eth/downloader: improve comment
* eth/downloader: mark peers idle when state sync is done
* eth/downloader: move pivot block splitting to processContent
This change also ensures that pivot block receipts aren't imported
before the pivot block itself.
* eth/downloader: limit state node retries
* eth/downloader: improve state node error handling and retry check
* eth/downloader: remove maxStateNodeRetries
It fails the sync too much.
* eth/downloader: remove last use of cancelCh in statesync.go
Fixes TestDeliverHeadersHang*Fast and (hopefully)
the weird cancellation behaviour at the end of fast sync.
* eth/downloader: fix leak in runStateSync
* eth/downloader: don't run processFullSyncContent in LightSync mode
* eth/downloader: improve comments
* eth/downloader: fix vet, megacheck
* eth/downloader: remove unrequested tasks anyway
* eth/downloader, trie: various polishes around duplicate items
This commit explicitly tracks duplicate and unexpected state
delieveries done against a trie Sync structure, also adding
there to import info logs.
The commit moves the db batch used to commit trie changes one
level deeper so its flushed after every node insertion. This
is needed to avoid a lot of duplicate retrievals caused by
inconsistencies between Sync internals and database. A better
approach is to track not-yet-written states in trie.Sync and
flush on commit, but I'm focuing on correctness first now.
The commit fixes a regression around pivot block fail count.
The counter previously was reset to 1 if and only if a sync
cycle progressed (inserted at least 1 entry to the database).
The current code reset it already if a node was delivered,
which is not stong enough, because unless it ends up written
to disk, an attacker can just loop and attack ad infinitum.
The commit also fixes a regression around state deliveries
and timeouts. The old downloader tracked if a delivery is
stale (none of the deliveries were requestedt), in which
case it didn't mark the node idle and did not send further
requests, since it signals a past timeout. The current code
did mark it idle even on stale deliveries, which eventually
caused two requests to be in flight at the same time, making
the deliveries always stale and mass duplicating retrievals
between multiple peers.
* eth/downloader: fix state request leak
This commit fixes the hang seen sometimes while doing the state
sync. The cause of the hang was a rare combination of events:
request state data from peer, peer drops and reconnects almost
immediately. This caused a new download task to be assigned to
the peer, overwriting the old one still waiting for a timeout,
which in turned leaked the requests out, never to be retried.
The fix is to ensure that a task assignment moves any pending
one back into the retry queue.
The commit also fixes a regression with peer dropping due to
stalls. The current code considered a peer stalling if they
timed out delivering 1 item. However, the downloader never
requests only one, the minimum is 2 (attempt to fine tune
estimated latency/bandwidth). The fix is simply to drop if
a timeout is detected at 2 items.
Apart from the above bugfixes, the commit contains some code
polishes I made while debugging the hang.
* core, eth, trie: support batched trie sync db writes
* trie: rename SyncMemCache to syncMemBatch
2017-06-22 15:26:03 +03:00
|
|
|
return err
|
2015-06-12 13:35:29 +03:00
|
|
|
}
|
2018-02-05 19:40:32 +03:00
|
|
|
atomic.StoreInt32(&d.committed, 1)
|
2019-05-13 15:28:01 +03:00
|
|
|
|
|
|
|
// If we had a bloom filter for the state sync, deallocate it now. Note, we only
|
|
|
|
// deallocate internally, but keep the empty wrapper. This ensures that if we do
|
2021-11-26 14:26:03 +03:00
|
|
|
// a rollback after committing the pivot and restarting snap sync, we don't end
|
2019-05-13 15:28:01 +03:00
|
|
|
// up using a nil bloom. Empty bloom is fine, it just returns that it does not
|
|
|
|
// have the info we need, so reach down to the database instead.
|
|
|
|
if d.stateBloom != nil {
|
|
|
|
d.stateBloom.Close()
|
|
|
|
}
|
2018-02-05 19:40:32 +03:00
|
|
|
return nil
|
2015-06-12 13:35:29 +03:00
|
|
|
}
|
|
|
|
|
2020-12-14 12:27:15 +03:00
|
|
|
// DeliverSnapPacket is invoked from a peer's message handler when it transmits a
|
|
|
|
// data packet for the local node to consume.
|
|
|
|
func (d *Downloader) DeliverSnapPacket(peer *snap.Peer, packet snap.Packet) error {
|
|
|
|
switch packet := packet.(type) {
|
|
|
|
case *snap.AccountRangePacket:
|
|
|
|
hashes, accounts, err := packet.Unpack()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
return d.SnapSyncer.OnAccounts(peer, packet.ID, hashes, accounts, packet.Proof)
|
|
|
|
|
|
|
|
case *snap.StorageRangesPacket:
|
|
|
|
hashset, slotset := packet.Unpack()
|
|
|
|
return d.SnapSyncer.OnStorage(peer, packet.ID, hashset, slotset, packet.Proof)
|
|
|
|
|
|
|
|
case *snap.ByteCodesPacket:
|
|
|
|
return d.SnapSyncer.OnByteCodes(peer, packet.ID, packet.Codes)
|
|
|
|
|
|
|
|
case *snap.TrieNodesPacket:
|
|
|
|
return d.SnapSyncer.OnTrieNodes(peer, packet.ID, packet.Nodes)
|
|
|
|
|
|
|
|
default:
|
|
|
|
return fmt.Errorf("unexpected snap packet type: %T", packet)
|
|
|
|
}
|
2015-10-05 19:37:56 +03:00
|
|
|
}
|