p2p/dnsdisc: re-check tree root when leaf resolution fails (#20682)
This adds additional logic to re-resolve the root name of a tree when a couple of leaf requests have failed. We need this change to avoid getting into a failure state where leaf requests keep failing for half an hour when the tree has been updated.
This commit is contained in:
parent
c2117982b8
commit
57d4898e29
@ -19,6 +19,7 @@ package dnsdisc
|
||||
import (
|
||||
"context"
|
||||
"crypto/ecdsa"
|
||||
"errors"
|
||||
"math/rand"
|
||||
"reflect"
|
||||
"testing"
|
||||
@ -176,11 +177,62 @@ func TestIteratorNodeUpdates(t *testing.T) {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// sync the original tree.
|
||||
// Sync the original tree.
|
||||
resolver.add(tree1.ToTXT("n"))
|
||||
checkIterator(t, it, nodes[:25])
|
||||
|
||||
// Update some nodes and ensure RandomNode returns the new nodes as well.
|
||||
// Ensure RandomNode returns the new nodes after the tree is updated.
|
||||
updateSomeNodes(nodesSeed1, nodes)
|
||||
tree2, _ := makeTestTree("n", nodes, nil)
|
||||
resolver.clear()
|
||||
resolver.add(tree2.ToTXT("n"))
|
||||
t.Log("tree updated")
|
||||
|
||||
clock.Run(c.cfg.RecheckInterval + 1*time.Second)
|
||||
checkIterator(t, it, nodes)
|
||||
}
|
||||
|
||||
// This test checks that the tree root is rechecked when a couple of leaf
|
||||
// requests have failed. The test is just like TestIteratorNodeUpdates, but
|
||||
// without advancing the clock by recheckInterval after the tree update.
|
||||
func TestIteratorRootRecheckOnFail(t *testing.T) {
|
||||
var (
|
||||
clock = new(mclock.Simulated)
|
||||
nodes = testNodes(nodesSeed1, 30)
|
||||
resolver = newMapResolver()
|
||||
c = NewClient(Config{
|
||||
Resolver: resolver,
|
||||
Logger: testlog.Logger(t, log.LvlTrace),
|
||||
RecheckInterval: 20 * time.Minute,
|
||||
RateLimit: 500,
|
||||
// Disabling the cache is required for this test because the client doesn't
|
||||
// notice leaf failures if all records are cached.
|
||||
CacheLimit: 1,
|
||||
})
|
||||
)
|
||||
c.clock = clock
|
||||
tree1, url := makeTestTree("n", nodes[:25], nil)
|
||||
it, err := c.NewIterator(url)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Sync the original tree.
|
||||
resolver.add(tree1.ToTXT("n"))
|
||||
checkIterator(t, it, nodes[:25])
|
||||
|
||||
// Ensure RandomNode returns the new nodes after the tree is updated.
|
||||
updateSomeNodes(nodesSeed1, nodes)
|
||||
tree2, _ := makeTestTree("n", nodes, nil)
|
||||
resolver.clear()
|
||||
resolver.add(tree2.ToTXT("n"))
|
||||
t.Log("tree updated")
|
||||
|
||||
checkIterator(t, it, nodes)
|
||||
}
|
||||
|
||||
// updateSomeNodes applies ENR updates to some of the given nodes.
|
||||
func updateSomeNodes(keySeed int64, nodes []*enode.Node) {
|
||||
keys := testKeys(nodesSeed1, len(nodes))
|
||||
for i, n := range nodes[:len(nodes)/2] {
|
||||
r := n.Record()
|
||||
@ -190,11 +242,6 @@ func TestIteratorNodeUpdates(t *testing.T) {
|
||||
n2, _ := enode.New(enode.ValidSchemes, r)
|
||||
nodes[i] = n2
|
||||
}
|
||||
tree2, _ := makeTestTree("n", nodes, nil)
|
||||
clock.Run(c.cfg.RecheckInterval + 1*time.Second)
|
||||
resolver.clear()
|
||||
resolver.add(tree2.ToTXT("n"))
|
||||
checkIterator(t, it, nodes)
|
||||
}
|
||||
|
||||
// This test verifies that randomIterator re-checks the root of the tree to catch
|
||||
@ -230,9 +277,10 @@ func TestIteratorLinkUpdates(t *testing.T) {
|
||||
// Add link to tree3, remove link to tree2.
|
||||
tree1, _ = makeTestTree("t1", nodes[:10], []string{url3})
|
||||
resolver.add(tree1.ToTXT("t1"))
|
||||
clock.Run(c.cfg.RecheckInterval + 1*time.Second)
|
||||
t.Log("tree1 updated")
|
||||
|
||||
clock.Run(c.cfg.RecheckInterval + 1*time.Second)
|
||||
|
||||
var wantNodes []*enode.Node
|
||||
wantNodes = append(wantNodes, tree1.Nodes()...)
|
||||
wantNodes = append(wantNodes, tree3.Nodes()...)
|
||||
@ -345,5 +393,5 @@ func (mr mapResolver) LookupTXT(ctx context.Context, name string) ([]string, err
|
||||
if record, ok := mr[name]; ok {
|
||||
return []string{record}, nil
|
||||
}
|
||||
return nil, nil
|
||||
return nil, errors.New("not found")
|
||||
}
|
||||
|
@ -25,15 +25,22 @@ import (
|
||||
"github.com/ethereum/go-ethereum/p2p/enode"
|
||||
)
|
||||
|
||||
const (
|
||||
rootRecheckFailCount = 5 // update root if this many leaf requests fail
|
||||
)
|
||||
|
||||
// clientTree is a full tree being synced.
|
||||
type clientTree struct {
|
||||
c *Client
|
||||
loc *linkEntry // link to this tree
|
||||
|
||||
lastRootCheck mclock.AbsTime // last revalidation of root
|
||||
root *rootEntry
|
||||
enrs *subtreeSync
|
||||
links *subtreeSync
|
||||
leafFailCount int
|
||||
rootFailCount int
|
||||
|
||||
root *rootEntry
|
||||
enrs *subtreeSync
|
||||
links *subtreeSync
|
||||
|
||||
lc *linkCache // tracks all links between all trees
|
||||
curLinks map[string]struct{} // links contained in this tree
|
||||
@ -46,7 +53,7 @@ func newClientTree(c *Client, lc *linkCache, loc *linkEntry) *clientTree {
|
||||
|
||||
// syncAll retrieves all entries of the tree.
|
||||
func (ct *clientTree) syncAll(dest map[string]entry) error {
|
||||
if err := ct.updateRoot(); err != nil {
|
||||
if err := ct.updateRoot(context.Background()); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := ct.links.resolveAll(dest); err != nil {
|
||||
@ -60,12 +67,20 @@ func (ct *clientTree) syncAll(dest map[string]entry) error {
|
||||
|
||||
// syncRandom retrieves a single entry of the tree. The Node return value
|
||||
// is non-nil if the entry was a node.
|
||||
func (ct *clientTree) syncRandom(ctx context.Context) (*enode.Node, error) {
|
||||
func (ct *clientTree) syncRandom(ctx context.Context) (n *enode.Node, err error) {
|
||||
if ct.rootUpdateDue() {
|
||||
if err := ct.updateRoot(); err != nil {
|
||||
if err := ct.updateRoot(ctx); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
// Update fail counter for leaf request errors.
|
||||
defer func() {
|
||||
if err != nil {
|
||||
ct.leafFailCount++
|
||||
}
|
||||
}()
|
||||
|
||||
// Link tree sync has priority, run it to completion before syncing ENRs.
|
||||
if !ct.links.done() {
|
||||
err := ct.syncNextLink(ctx)
|
||||
@ -138,15 +153,22 @@ func removeHash(h []string, index int) []string {
|
||||
}
|
||||
|
||||
// updateRoot ensures that the given tree has an up-to-date root.
|
||||
func (ct *clientTree) updateRoot() error {
|
||||
func (ct *clientTree) updateRoot(ctx context.Context) error {
|
||||
if !ct.slowdownRootUpdate(ctx) {
|
||||
return ctx.Err()
|
||||
}
|
||||
|
||||
ct.lastRootCheck = ct.c.clock.Now()
|
||||
ctx, cancel := context.WithTimeout(context.Background(), ct.c.cfg.Timeout)
|
||||
ctx, cancel := context.WithTimeout(ctx, ct.c.cfg.Timeout)
|
||||
defer cancel()
|
||||
root, err := ct.c.resolveRoot(ctx, ct.loc)
|
||||
if err != nil {
|
||||
ct.rootFailCount++
|
||||
return err
|
||||
}
|
||||
ct.root = &root
|
||||
ct.rootFailCount = 0
|
||||
ct.leafFailCount = 0
|
||||
|
||||
// Invalidate subtrees if changed.
|
||||
if ct.links == nil || root.lroot != ct.links.root {
|
||||
@ -161,7 +183,32 @@ func (ct *clientTree) updateRoot() error {
|
||||
|
||||
// rootUpdateDue returns true when a root update is needed.
|
||||
func (ct *clientTree) rootUpdateDue() bool {
|
||||
return ct.root == nil || time.Duration(ct.c.clock.Now()-ct.lastRootCheck) > ct.c.cfg.RecheckInterval
|
||||
tooManyFailures := ct.leafFailCount > rootRecheckFailCount
|
||||
scheduledCheck := ct.c.clock.Now().Sub(ct.lastRootCheck) > ct.c.cfg.RecheckInterval
|
||||
return ct.root == nil || tooManyFailures || scheduledCheck
|
||||
}
|
||||
|
||||
// slowdownRootUpdate applies a delay to root resolution if is tried
|
||||
// too frequently. This avoids busy polling when the client is offline.
|
||||
// Returns true if the timeout passed, false if sync was canceled.
|
||||
func (ct *clientTree) slowdownRootUpdate(ctx context.Context) bool {
|
||||
var delay time.Duration
|
||||
switch {
|
||||
case ct.rootFailCount > 20:
|
||||
delay = 10 * time.Second
|
||||
case ct.rootFailCount > 5:
|
||||
delay = 5 * time.Second
|
||||
default:
|
||||
return true
|
||||
}
|
||||
timeout := ct.c.clock.NewTimer(delay)
|
||||
defer timeout.Stop()
|
||||
select {
|
||||
case <-timeout.C():
|
||||
return true
|
||||
case <-ctx.Done():
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// subtreeSync is the sync of an ENR or link subtree.
|
||||
|
Loading…
Reference in New Issue
Block a user