p2p/dnsdisc: re-check tree root when leaf resolution fails (#20682)

This adds logic to re-resolve the root name of a tree after several leaf
requests have failed. We need this change to avoid getting stuck in a
failure state where leaf requests keep failing for half an hour after the
tree has been updated.
Felix Lange authored on 2020-02-17 15:23:25 +01:00, committed by GitHub
commit 57d4898e29 (parent c2117982b8)
2 changed files with 113 additions and 18 deletions
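
In essence, the change makes the client count failed leaf lookups per tree and re-resolve the root once that count crosses a threshold, instead of only on the periodic schedule. A condensed sketch of the new condition, using the names introduced in the diff below (this is an annotated excerpt, not standalone code):

    // A failed leaf request bumps ct.leafFailCount; once it exceeds
    // rootRecheckFailCount, the next sync call re-resolves the root
    // immediately instead of waiting out the full RecheckInterval.
    const rootRecheckFailCount = 5

    func (ct *clientTree) rootUpdateDue() bool {
        tooManyFailures := ct.leafFailCount > rootRecheckFailCount
        scheduledCheck := ct.c.clock.Now().Sub(ct.lastRootCheck) > ct.c.cfg.RecheckInterval
        return ct.root == nil || tooManyFailures || scheduledCheck
    }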

@@ -19,6 +19,7 @@ package dnsdisc
import (
"context"
"crypto/ecdsa"
"errors"
"math/rand"
"reflect"
"testing"
@@ -176,11 +177,62 @@ func TestIteratorNodeUpdates(t *testing.T) {
t.Fatal(err)
}
- // sync the original tree.
+ // Sync the original tree.
resolver.add(tree1.ToTXT("n"))
checkIterator(t, it, nodes[:25])
- // Update some nodes and ensure RandomNode returns the new nodes as well.
+ // Ensure RandomNode returns the new nodes after the tree is updated.
+ updateSomeNodes(nodesSeed1, nodes)
+ tree2, _ := makeTestTree("n", nodes, nil)
+ resolver.clear()
+ resolver.add(tree2.ToTXT("n"))
+ t.Log("tree updated")
+ clock.Run(c.cfg.RecheckInterval + 1*time.Second)
+ checkIterator(t, it, nodes)
+ }
+ // This test checks that the tree root is rechecked when several leaf
+ // requests have failed. The test is just like TestIteratorNodeUpdates, but
+ // without advancing the clock by RecheckInterval after the tree update.
+ func TestIteratorRootRecheckOnFail(t *testing.T) {
+ var (
+ clock = new(mclock.Simulated)
+ nodes = testNodes(nodesSeed1, 30)
+ resolver = newMapResolver()
+ c = NewClient(Config{
+ Resolver: resolver,
+ Logger: testlog.Logger(t, log.LvlTrace),
+ RecheckInterval: 20 * time.Minute,
+ RateLimit: 500,
+ // Disabling the cache is required for this test because the client doesn't
+ // notice leaf failures if all records are cached.
+ CacheLimit: 1,
+ })
+ )
+ c.clock = clock
+ tree1, url := makeTestTree("n", nodes[:25], nil)
+ it, err := c.NewIterator(url)
+ if err != nil {
+ t.Fatal(err)
+ }
+ // Sync the original tree.
+ resolver.add(tree1.ToTXT("n"))
+ checkIterator(t, it, nodes[:25])
+ // Ensure RandomNode returns the new nodes after the tree is updated.
+ updateSomeNodes(nodesSeed1, nodes)
+ tree2, _ := makeTestTree("n", nodes, nil)
+ resolver.clear()
+ resolver.add(tree2.ToTXT("n"))
+ t.Log("tree updated")
+ checkIterator(t, it, nodes)
+ }
+ // updateSomeNodes applies ENR updates to some of the given nodes.
+ func updateSomeNodes(keySeed int64, nodes []*enode.Node) {
keys := testKeys(nodesSeed1, len(nodes))
for i, n := range nodes[:len(nodes)/2] {
r := n.Record()
@@ -190,11 +242,6 @@ func TestIteratorNodeUpdates(t *testing.T) {
n2, _ := enode.New(enode.ValidSchemes, r)
nodes[i] = n2
}
- tree2, _ := makeTestTree("n", nodes, nil)
- clock.Run(c.cfg.RecheckInterval + 1*time.Second)
- resolver.clear()
- resolver.add(tree2.ToTXT("n"))
- checkIterator(t, it, nodes)
}
// This test verifies that randomIterator re-checks the root of the tree to catch
@@ -230,9 +277,10 @@ func TestIteratorLinkUpdates(t *testing.T) {
// Add link to tree3, remove link to tree2.
tree1, _ = makeTestTree("t1", nodes[:10], []string{url3})
resolver.add(tree1.ToTXT("t1"))
- clock.Run(c.cfg.RecheckInterval + 1*time.Second)
+ t.Log("tree1 updated")
+ clock.Run(c.cfg.RecheckInterval + 1*time.Second)
var wantNodes []*enode.Node
wantNodes = append(wantNodes, tree1.Nodes()...)
wantNodes = append(wantNodes, tree3.Nodes()...)
@@ -345,5 +393,5 @@ func (mr mapResolver) LookupTXT(ctx context.Context, name string) ([]string, error) {
if record, ok := mr[name]; ok {
return []string{record}, nil
}
- return nil, nil
+ return nil, errors.New("not found")
}
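
The LookupTXT change above is what makes the failure counting work: a resolver that reports a missing name as (nil, nil) looks like an empty success, so the client would never register a leaf failure. A hedged usage sketch of the public API as it appears in the tests above (the enrtree URL is the placeholder from the EIP-1459 examples, not a live tree; a zero-value Config falls back to the system DNS resolver and default intervals):

    package main

    import (
        "fmt"

        "github.com/ethereum/go-ethereum/p2p/dnsdisc"
    )

    func main() {
        client := dnsdisc.NewClient(dnsdisc.Config{})
        it, err := client.NewIterator("enrtree://AM5FCQLWIZX2QFPNJAP7VUERCCRNGRHWZG3YYHIUV7BVDQ5FDPRT2@nodes.example.org")
        if err != nil {
            panic(err)
        }
        defer it.Close()
        // Failed leaf lookups now count toward a root re-check, so an updated
        // tree is picked up without waiting out the full recheck interval.
        for i := 0; i < 5 && it.Next(); i++ {
            fmt.Println(it.Node().URLv4())
        }
    }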

@@ -25,12 +25,19 @@ import (
"github.com/ethereum/go-ethereum/p2p/enode"
)
+ const (
+ rootRecheckFailCount = 5 // update root if this many leaf requests fail
+ )
// clientTree is a full tree being synced.
type clientTree struct {
c *Client
loc *linkEntry // link to this tree
lastRootCheck mclock.AbsTime // last revalidation of root
+ leafFailCount int
+ rootFailCount int
root *rootEntry
enrs *subtreeSync
links *subtreeSync
@@ -46,7 +53,7 @@ func newClientTree(c *Client, lc *linkCache, loc *linkEntry) *clientTree {
// syncAll retrieves all entries of the tree.
func (ct *clientTree) syncAll(dest map[string]entry) error {
- if err := ct.updateRoot(); err != nil {
+ if err := ct.updateRoot(context.Background()); err != nil {
return err
}
if err := ct.links.resolveAll(dest); err != nil {
@@ -60,12 +67,20 @@ func (ct *clientTree) syncAll(dest map[string]entry) error {
// syncRandom retrieves a single entry of the tree. The Node return value
// is non-nil if the entry was a node.
- func (ct *clientTree) syncRandom(ctx context.Context) (*enode.Node, error) {
+ func (ct *clientTree) syncRandom(ctx context.Context) (n *enode.Node, err error) {
if ct.rootUpdateDue() {
- if err := ct.updateRoot(); err != nil {
+ if err := ct.updateRoot(ctx); err != nil {
return nil, err
}
}
+ // Update fail counter for leaf request errors.
+ defer func() {
+ if err != nil {
+ ct.leafFailCount++
+ }
+ }()
// Link tree sync has priority, run it to completion before syncing ENRs.
if !ct.links.done() {
err := ct.syncNextLink(ctx)
@@ -138,15 +153,22 @@ func removeHash(h []string, index int) []string {
}
// updateRoot ensures that the given tree has an up-to-date root.
- func (ct *clientTree) updateRoot() error {
+ func (ct *clientTree) updateRoot(ctx context.Context) error {
+ if !ct.slowdownRootUpdate(ctx) {
+ return ctx.Err()
+ }
ct.lastRootCheck = ct.c.clock.Now()
- ctx, cancel := context.WithTimeout(context.Background(), ct.c.cfg.Timeout)
+ ctx, cancel := context.WithTimeout(ctx, ct.c.cfg.Timeout)
defer cancel()
root, err := ct.c.resolveRoot(ctx, ct.loc)
if err != nil {
+ ct.rootFailCount++
return err
}
ct.root = &root
+ ct.rootFailCount = 0
+ ct.leafFailCount = 0
// Invalidate subtrees if changed.
if ct.links == nil || root.lroot != ct.links.root {
@@ -161,7 +183,32 @@ func (ct *clientTree) updateRoot() error {
// rootUpdateDue returns true when a root update is needed.
func (ct *clientTree) rootUpdateDue() bool {
- return ct.root == nil || time.Duration(ct.c.clock.Now()-ct.lastRootCheck) > ct.c.cfg.RecheckInterval
+ tooManyFailures := ct.leafFailCount > rootRecheckFailCount
+ scheduledCheck := ct.c.clock.Now().Sub(ct.lastRootCheck) > ct.c.cfg.RecheckInterval
+ return ct.root == nil || tooManyFailures || scheduledCheck
}
+ // slowdownRootUpdate applies a delay to root resolution if it is tried
+ // too frequently. This avoids busy polling when the client is offline.
+ // Returns true if the timeout passed, false if sync was canceled.
+ func (ct *clientTree) slowdownRootUpdate(ctx context.Context) bool {
+ var delay time.Duration
+ switch {
+ case ct.rootFailCount > 20:
+ delay = 10 * time.Second
+ case ct.rootFailCount > 5:
+ delay = 5 * time.Second
+ default:
+ return true
+ }
+ timeout := ct.c.clock.NewTimer(delay)
+ defer timeout.Stop()
+ select {
+ case <-timeout.C():
+ return true
+ case <-ctx.Done():
+ return false
+ }
+ }
// subtreeSync is the sync of an ENR or link subtree.
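
For reference, the backoff in slowdownRootUpdate works out to three tiers. The helper below is a hypothetical restatement of that schedule for illustration; it is not part of the package:

    package dnsdisc // hypothetical placement, for illustration only

    import "time"

    // rootUpdateDelay restates the slowdownRootUpdate backoff schedule:
    //   up to 5 consecutive root failures: retry immediately
    //   6 to 20 failures:                  wait 5 seconds
    //   more than 20 failures:             wait 10 seconds
    func rootUpdateDelay(rootFailCount int) time.Duration {
        switch {
        case rootFailCount > 20:
            return 10 * time.Second
        case rootFailCount > 5:
            return 5 * time.Second
        default:
            return 0
        }
    }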