swarm/pss: Fix flaky TestProxNetwork (#19471)

This commit is contained in:
gluk256 2019-04-19 11:15:17 +02:00 committed by Anton Evangelatov
parent d8dc37c85b
commit d9403690ec
3 changed files with 188 additions and 194 deletions

@ -234,9 +234,9 @@ func (s *Simulation) UploadSnapshot(ctx context.Context, snapshotFile string, op
if err != nil { if err != nil {
return err return err
} }
defer f.Close()
jsonbyte, err := ioutil.ReadAll(f) jsonbyte, err := ioutil.ReadAll(f)
f.Close()
if err != nil { if err != nil {
return err return err
} }

@ -4,10 +4,7 @@ import (
"context" "context"
"crypto/ecdsa" "crypto/ecdsa"
"encoding/binary" "encoding/binary"
"errors"
"fmt" "fmt"
"strconv"
"strings"
"sync" "sync"
"testing" "testing"
"time" "time"
@ -39,24 +36,20 @@ type handlerNotification struct {
} }
type testData struct { type testData struct {
mu sync.Mutex sim *simulation.Simulation
sim *simulation.Simulation kademlias map[enode.ID]*network.Kademlia
handlerDone bool // set to true on termination of the simulation run nodeAddresses map[enode.ID][]byte // make predictable overlay addresses from the generated random enode ids
requiredMessages int senders map[int]enode.ID // originating nodes of the messages (intention is to choose as far as possible from the receiving neighborhood)
allowedMessages int recipientAddresses [][]byte
messageCount int
kademlias map[enode.ID]*network.Kademlia requiredMsgCount int
nodeAddrs map[enode.ID][]byte // make predictable overlay addresses from the generated random enode ids requiredMsgs map[enode.ID][]uint64 // message serials we expect respective nodes to receive
recipients map[int][]enode.ID // for logging output only allowedMsgs map[enode.ID][]uint64 // message serials we expect respective nodes to receive
allowed map[int][]enode.ID // allowed recipients
expectedMsgs map[enode.ID][]uint64 // message serials we expect respective nodes to receive notifications []handlerNotification // notification queue
allowedMsgs map[enode.ID][]uint64 // message serials we expect respective nodes to receive totalMsgCount int
senders map[int]enode.ID // originating nodes of the messages (intention is to choose as far as possible from the receiving neighborhood) handlerDone bool // set to true on termination of the simulation run
handlerC chan handlerNotification // passes message from pss message handler to simulation driver mu sync.Mutex
doneC chan struct{} // terminates the handler channel listener
errC chan error // error to pass to main sim thread
msgC chan handlerNotification // message receipt notification to main sim thread
msgs [][]byte // recipient addresses of messages
} }
var ( var (
@ -64,67 +57,60 @@ var (
topic = BytesToTopic([]byte{0xf3, 0x9e, 0x06, 0x82}) topic = BytesToTopic([]byte{0xf3, 0x9e, 0x06, 0x82})
) )
func (d *testData) getMsgCount() int { func (td *testData) pushNotification(val handlerNotification) {
d.mu.Lock() td.mu.Lock()
defer d.mu.Unlock() td.notifications = append(td.notifications, val)
return d.messageCount td.mu.Unlock()
} }
func (d *testData) incrementMsgCount() int { func (td *testData) popNotification() (first handlerNotification, exist bool) {
d.mu.Lock() td.mu.Lock()
defer d.mu.Unlock() if len(td.notifications) > 0 {
d.messageCount++ exist = true
return d.messageCount first = td.notifications[0]
} td.notifications = td.notifications[1:]
func (d *testData) isDone() bool {
d.mu.Lock()
defer d.mu.Unlock()
return d.handlerDone
}
func (d *testData) setDone() {
d.mu.Lock()
defer d.mu.Unlock()
d.handlerDone = true
}
func getCmdParams(t *testing.T) (int, int, time.Duration) {
args := strings.Split(t.Name(), "/")
msgCount, err := strconv.ParseInt(args[2], 10, 16)
if err != nil {
t.Fatal(err)
} }
nodeCount, err := strconv.ParseInt(args[1], 10, 16) td.mu.Unlock()
if err != nil { return first, exist
t.Fatal(err) }
}
timeoutStr := fmt.Sprintf("%ss", args[3]) func (td *testData) getMsgCount() int {
timeoutDur, err := time.ParseDuration(timeoutStr) td.mu.Lock()
if err != nil { defer td.mu.Unlock()
t.Fatal(err) return td.totalMsgCount
} }
return int(msgCount), int(nodeCount), timeoutDur
func (td *testData) incrementMsgCount() int {
td.mu.Lock()
defer td.mu.Unlock()
td.totalMsgCount++
return td.totalMsgCount
}
func (td *testData) isDone() bool {
td.mu.Lock()
defer td.mu.Unlock()
return td.handlerDone
}
func (td *testData) setDone() {
td.mu.Lock()
defer td.mu.Unlock()
td.handlerDone = true
} }
func newTestData() *testData { func newTestData() *testData {
return &testData{ return &testData{
kademlias: make(map[enode.ID]*network.Kademlia), kademlias: make(map[enode.ID]*network.Kademlia),
nodeAddrs: make(map[enode.ID][]byte), nodeAddresses: make(map[enode.ID][]byte),
recipients: make(map[int][]enode.ID), requiredMsgs: make(map[enode.ID][]uint64),
allowed: make(map[int][]enode.ID), allowedMsgs: make(map[enode.ID][]uint64),
expectedMsgs: make(map[enode.ID][]uint64), senders: make(map[int]enode.ID),
allowedMsgs: make(map[enode.ID][]uint64),
senders: make(map[int]enode.ID),
handlerC: make(chan handlerNotification),
doneC: make(chan struct{}),
errC: make(chan error),
msgC: make(chan handlerNotification),
} }
} }
func (d *testData) getKademlia(nodeId *enode.ID) (*network.Kademlia, error) { func (td *testData) getKademlia(nodeId *enode.ID) (*network.Kademlia, error) {
kadif, ok := d.sim.NodeItem(*nodeId, simulation.BucketKeyKademlia) kadif, ok := td.sim.NodeItem(*nodeId, simulation.BucketKeyKademlia)
if !ok { if !ok {
return nil, fmt.Errorf("no kademlia entry for %v", nodeId) return nil, fmt.Errorf("no kademlia entry for %v", nodeId)
} }
@ -135,29 +121,29 @@ func (d *testData) getKademlia(nodeId *enode.ID) (*network.Kademlia, error) {
return kad, nil return kad, nil
} }
func (d *testData) init(msgCount int) error { func (td *testData) init(msgCount int) error {
log.Debug("TestProxNetwork start") log.Debug("TestProxNetwork start")
for _, nodeId := range d.sim.NodeIDs() { for _, nodeId := range td.sim.NodeIDs() {
kad, err := d.getKademlia(&nodeId) kad, err := td.getKademlia(&nodeId)
if err != nil { if err != nil {
return err return err
} }
d.nodeAddrs[nodeId] = kad.BaseAddr() td.nodeAddresses[nodeId] = kad.BaseAddr()
} }
for i := 0; i < int(msgCount); i++ { for i := 0; i < int(msgCount); i++ {
msgAddr := pot.RandomAddress() // we choose message addresses randomly msgAddr := pot.RandomAddress() // we choose message addresses randomly
d.msgs = append(d.msgs, msgAddr.Bytes()) td.recipientAddresses = append(td.recipientAddresses, msgAddr.Bytes())
smallestPo := 256 smallestPo := 256
var targets []enode.ID var targets []enode.ID
var closestPO int var closestPO int
// loop through all nodes and find the required and allowed recipients of each message // loop through all nodes and find the required and allowed recipients of each message
// (for more information, please see the comment to the main test function) // (for more information, please see the comment to the main test function)
for _, nod := range d.sim.Net.GetNodes() { for _, nod := range td.sim.Net.GetNodes() {
po, _ := pof(d.msgs[i], d.nodeAddrs[nod.ID()], 0) po, _ := pof(td.recipientAddresses[i], td.nodeAddresses[nod.ID()], 0)
depth := d.kademlias[nod.ID()].NeighbourhoodDepth() depth := td.kademlias[nod.ID()].NeighbourhoodDepth()
// only nodes with closest IDs (wrt the msg address) will be required recipients // only nodes with closest IDs (wrt the msg address) will be required recipients
if po > closestPO { if po > closestPO {
@ -169,28 +155,25 @@ func (d *testData) init(msgCount int) error {
} }
if po >= depth { if po >= depth {
d.allowedMessages++ td.allowedMsgs[nod.ID()] = append(td.allowedMsgs[nod.ID()], uint64(i))
d.allowed[i] = append(d.allowed[i], nod.ID())
d.allowedMsgs[nod.ID()] = append(d.allowedMsgs[nod.ID()], uint64(i))
} }
// a node with the smallest PO (wrt msg) will be the sender, // a node with the smallest PO (wrt msg) will be the sender,
// in order to increase the distance the msg must travel // in order to increase the distance the msg must travel
if po < smallestPo { if po < smallestPo {
smallestPo = po smallestPo = po
d.senders[i] = nod.ID() td.senders[i] = nod.ID()
} }
} }
d.requiredMessages += len(targets) td.requiredMsgCount += len(targets)
for _, id := range targets { for _, id := range targets {
d.recipients[i] = append(d.recipients[i], id) td.requiredMsgs[id] = append(td.requiredMsgs[id], uint64(i))
d.expectedMsgs[id] = append(d.expectedMsgs[id], uint64(i))
} }
log.Debug("nn for msg", "targets", len(d.recipients[i]), "msgidx", i, "msg", common.Bytes2Hex(msgAddr[:8]), "sender", d.senders[i], "senderpo", smallestPo) log.Debug("nn for msg", "targets", len(targets), "msgidx", i, "msg", common.Bytes2Hex(msgAddr[:8]), "sender", td.senders[i], "senderpo", smallestPo)
} }
log.Debug("msgs to receive", "count", d.requiredMessages) log.Debug("recipientAddresses to receive", "count", td.requiredMsgCount)
return nil return nil
} }
@ -213,144 +196,161 @@ func (d *testData) init(msgCount int) error {
// nodes Y and Z will be considered required recipients of the msg, // nodes Y and Z will be considered required recipients of the msg,
// whereas nodes X, Y and Z will be allowed recipients. // whereas nodes X, Y and Z will be allowed recipients.
func TestProxNetwork(t *testing.T) { func TestProxNetwork(t *testing.T) {
t.Run("16/16/15", testProxNetwork) t.Run("16_nodes,_16_messages,_16_seconds", func(t *testing.T) {
testProxNetwork(t, 16, 16, 16*time.Second)
})
} }
// params in run name: nodes/msgs
func TestProxNetworkLong(t *testing.T) { func TestProxNetworkLong(t *testing.T) {
if !*longrunning { if !*longrunning {
t.Skip("run with --longrunning flag to run extensive network tests") t.Skip("run with --longrunning flag to run extensive network tests")
} }
t.Run("8/100/30", testProxNetwork) t.Run("8_nodes,_100_messages,_30_seconds", func(t *testing.T) {
t.Run("16/100/30", testProxNetwork) testProxNetwork(t, 8, 100, 30*time.Second)
t.Run("32/100/60", testProxNetwork) })
t.Run("64/100/60", testProxNetwork) t.Run("16_nodes,_100_messages,_30_seconds", func(t *testing.T) {
t.Run("128/100/120", testProxNetwork) testProxNetwork(t, 16, 100, 30*time.Second)
})
t.Run("32_nodes,_100_messages,_60_seconds", func(t *testing.T) {
testProxNetwork(t, 32, 100, 1*time.Minute)
})
t.Run("64_nodes,_100_messages,_60_seconds", func(t *testing.T) {
testProxNetwork(t, 64, 100, 1*time.Minute)
})
t.Run("128_nodes,_100_messages,_120_seconds", func(t *testing.T) {
testProxNetwork(t, 128, 100, 2*time.Minute)
})
} }
func testProxNetwork(t *testing.T) { func testProxNetwork(t *testing.T, nodeCount int, msgCount int, timeout time.Duration) {
tstdata := newTestData() td := newTestData()
msgCount, nodeCount, timeout := getCmdParams(t)
handlerContextFuncs := make(map[Topic]handlerContextFunc) handlerContextFuncs := make(map[Topic]handlerContextFunc)
handlerContextFuncs[topic] = nodeMsgHandler handlerContextFuncs[topic] = nodeMsgHandler
services := newProxServices(tstdata, true, handlerContextFuncs, tstdata.kademlias) services := newProxServices(td, true, handlerContextFuncs, td.kademlias)
tstdata.sim = simulation.New(services) td.sim = simulation.New(services)
defer tstdata.sim.Close() defer td.sim.Close()
ctx, cancel := context.WithTimeout(context.Background(), timeout) ctx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel() defer cancel()
filename := fmt.Sprintf("testdata/snapshot_%d.json", nodeCount) filename := fmt.Sprintf("testdata/snapshot_%d.json", nodeCount)
err := tstdata.sim.UploadSnapshot(ctx, filename) err := td.sim.UploadSnapshot(ctx, filename)
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }
err = tstdata.init(msgCount) // initialize the test data err = td.init(msgCount) // initialize the test data
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }
wrapper := func(c context.Context, _ *simulation.Simulation) error { wrapper := func(c context.Context, _ *simulation.Simulation) error {
return testRoutine(tstdata, c) return testRoutine(td, c)
} }
result := tstdata.sim.Run(ctx, wrapper) // call the main test function result := td.sim.Run(ctx, wrapper) // call the main test function
if result.Error != nil { if result.Error != nil {
// context deadline exceeded timedOut := result.Error == context.DeadlineExceeded
// however, it might just mean that not all possible messages are received if !timedOut || td.getMsgCount() < td.requiredMsgCount {
// now we must check if all required messages are received
cnt := tstdata.getMsgCount()
log.Debug("TestProxNetwork finished", "rcv", cnt)
if cnt < tstdata.requiredMessages {
t.Fatal(result.Error) t.Fatal(result.Error)
} }
} }
t.Logf("completed %d", result.Duration)
} }
func (tstdata *testData) sendAllMsgs() { func (td *testData) sendAllMsgs() error {
for i, msg := range tstdata.msgs { nodes := make(map[int]*rpc.Client)
log.Debug("sending msg", "idx", i, "from", tstdata.senders[i]) for i := range td.recipientAddresses {
nodeClient, err := tstdata.sim.Net.GetNode(tstdata.senders[i]).Client() nodeClient, err := td.sim.Net.GetNode(td.senders[i]).Client()
if err != nil { if err != nil {
tstdata.errC <- err return err
} }
nodes[i] = nodeClient
}
for i, msg := range td.recipientAddresses {
log.Debug("sending msg", "idx", i, "from", td.senders[i])
nodeClient := nodes[i]
var uvarByte [8]byte var uvarByte [8]byte
binary.PutUvarint(uvarByte[:], uint64(i)) binary.PutUvarint(uvarByte[:], uint64(i))
nodeClient.Call(nil, "pss_sendRaw", hexutil.Encode(msg), hexutil.Encode(topic[:]), hexutil.Encode(uvarByte[:])) nodeClient.Call(nil, "pss_sendRaw", hexutil.Encode(msg), hexutil.Encode(topic[:]), hexutil.Encode(uvarByte[:]))
} }
log.Debug("all messages sent") return nil
}
func isMoreTimeLeft(ctx context.Context) bool {
select {
case <-ctx.Done():
return false
default:
return true
}
} }
// testRoutine is the main test function, called by Simulation.Run() // testRoutine is the main test function, called by Simulation.Run()
func testRoutine(tstdata *testData, ctx context.Context) error { func testRoutine(td *testData, ctx context.Context) error {
go handlerChannelListener(tstdata, ctx)
go tstdata.sendAllMsgs()
received := 0
// collect incoming messages and terminate with corresponding status when message handler listener ends hasMoreRound := func(err error, hadMessage bool) bool {
for { return err == nil && (hadMessage || isMoreTimeLeft(ctx))
select { }
case err := <-tstdata.errC:
return err if err := td.sendAllMsgs(); err != nil {
case hn := <-tstdata.msgC: return err
received++ }
log.Debug("msg received", "msgs_received", received, "total_expected", tstdata.requiredMessages, "id", hn.id, "serial", hn.serial)
if received == tstdata.allowedMessages { var err error
close(tstdata.doneC) received := 0
return nil hadMessage := false
}
for oneMoreRound := true; oneMoreRound; oneMoreRound = hasMoreRound(err, hadMessage) {
message, hadMessage := td.popNotification()
if !isMoreTimeLeft(ctx) {
// Stop handlers from sending more messages.
// Note: only best effort, race is possible.
td.setDone()
} }
if hadMessage {
if td.isAllowedMessage(message) {
received++
log.Debug("msg received", "msgs_received", received, "total_expected", td.requiredMsgCount, "id", message.id, "serial", message.serial)
} else {
err = fmt.Errorf("message %d received by wrong recipient %v", message.serial, message.id)
}
} else {
time.Sleep(32 * time.Millisecond)
}
}
if err != nil {
return err
}
if td.getMsgCount() < td.requiredMsgCount {
return ctx.Err()
} }
return nil return nil
} }
func handlerChannelListener(tstdata *testData, ctx context.Context) { func (td *testData) isAllowedMessage(n handlerNotification) bool {
for { // check if message serial is in expected messages for this recipient
select { for _, s := range td.allowedMsgs[n.id] {
case <-tstdata.doneC: // graceful exit if n.serial == s {
tstdata.setDone() return true
tstdata.errC <- nil
return
case <-ctx.Done(): // timeout or cancel
tstdata.setDone()
tstdata.errC <- ctx.Err()
return
// incoming message from pss message handler
case handlerNotification := <-tstdata.handlerC:
// check if recipient has already received all its messages and notify to fail the test if so
aMsgs := tstdata.allowedMsgs[handlerNotification.id]
if len(aMsgs) == 0 {
tstdata.setDone()
tstdata.errC <- fmt.Errorf("too many messages received by recipient %x", handlerNotification.id)
return
}
// check if message serial is in expected messages for this recipient and notify to fail the test if not
idx := -1
for i, msg := range aMsgs {
if handlerNotification.serial == msg {
idx = i
break
}
}
if idx == -1 {
tstdata.setDone()
tstdata.errC <- fmt.Errorf("message %d received by wrong recipient %v", handlerNotification.serial, handlerNotification.id)
return
}
// message is ok, so remove that message serial from the recipient expectation array and notify the main sim thread
aMsgs[idx] = aMsgs[len(aMsgs)-1]
aMsgs = aMsgs[:len(aMsgs)-1]
tstdata.msgC <- handlerNotification
} }
} }
return false
} }
func nodeMsgHandler(tstdata *testData, config *adapters.NodeConfig) *handler { func (td *testData) removeAllowedMessage(id enode.ID, index int) {
last := len(td.allowedMsgs[id]) - 1
td.allowedMsgs[id][index] = td.allowedMsgs[id][last]
td.allowedMsgs[id] = td.allowedMsgs[id][:last]
}
func nodeMsgHandler(td *testData, config *adapters.NodeConfig) *handler {
return &handler{ return &handler{
f: func(msg []byte, p *p2p.Peer, asymmetric bool, keyid string) error { f: func(msg []byte, p *p2p.Peer, asymmetric bool, keyid string) error {
cnt := tstdata.incrementMsgCount() if td.isDone() {
log.Debug("nodeMsgHandler rcv", "cnt", cnt) return nil // terminate if simulation is over
}
td.incrementMsgCount()
// using simple serial in message body, makes it easy to keep track of who's getting what // using simple serial in message body, makes it easy to keep track of who's getting what
serial, c := binary.Uvarint(msg) serial, c := binary.Uvarint(msg)
@ -358,15 +358,7 @@ func nodeMsgHandler(tstdata *testData, config *adapters.NodeConfig) *handler {
log.Crit(fmt.Sprintf("corrupt message received by %x (uvarint parse returned %d)", config.ID, c)) log.Crit(fmt.Sprintf("corrupt message received by %x (uvarint parse returned %d)", config.ID, c))
} }
if tstdata.isDone() { td.pushNotification(handlerNotification{id: config.ID, serial: serial})
return errors.New("handlers aborted") // terminate if simulation is over
}
// pass message context to the listener in the simulation
tstdata.handlerC <- handlerNotification{
id: config.ID,
serial: serial,
}
return nil return nil
}, },
caps: &handlerCaps{ caps: &handlerCaps{
@ -378,7 +370,7 @@ func nodeMsgHandler(tstdata *testData, config *adapters.NodeConfig) *handler {
// an adaptation of the same services setup as in pss_test.go // an adaptation of the same services setup as in pss_test.go
// replaces pss_test.go when those tests are rewritten to the new swarm/network/simulation package // replaces pss_test.go when those tests are rewritten to the new swarm/network/simulation package
func newProxServices(tstdata *testData, allowRaw bool, handlerContextFuncs map[Topic]handlerContextFunc, kademlias map[enode.ID]*network.Kademlia) map[string]simulation.ServiceFunc { func newProxServices(td *testData, allowRaw bool, handlerContextFuncs map[Topic]handlerContextFunc, kademlias map[enode.ID]*network.Kademlia) map[string]simulation.ServiceFunc {
stateStore := state.NewInmemoryStore() stateStore := state.NewInmemoryStore()
kademlia := func(id enode.ID, bzzkey []byte) *network.Kademlia { kademlia := func(id enode.ID, bzzkey []byte) *network.Kademlia {
if k, ok := kademlias[id]; ok { if k, ok := kademlias[id]; ok {
@ -415,6 +407,9 @@ func newProxServices(tstdata *testData, allowRaw bool, handlerContextFuncs map[T
UnderlayAddr: addr.Under(), UnderlayAddr: addr.Under(),
HiveParams: hp, HiveParams: hp,
} }
bzzKey := network.PrivateKeyToBzzKey(bzzPrivateKey)
pskad := kademlia(ctx.Config.ID, bzzKey)
b.Store(simulation.BucketKeyKademlia, pskad)
return network.NewBzz(config, kademlia(ctx.Config.ID, addr.OAddr), stateStore, nil, nil), nil, nil return network.NewBzz(config, kademlia(ctx.Config.ID, addr.OAddr), stateStore, nil, nil), nil, nil
}, },
"pss": func(ctx *adapters.ServiceContext, b *sync.Map) (node.Service, func(), error) { "pss": func(ctx *adapters.ServiceContext, b *sync.Map) (node.Service, func(), error) {
@ -434,6 +429,7 @@ func newProxServices(tstdata *testData, allowRaw bool, handlerContextFuncs map[T
} }
bzzKey := network.PrivateKeyToBzzKey(bzzPrivateKey) bzzKey := network.PrivateKeyToBzzKey(bzzPrivateKey)
pskad := kademlia(ctx.Config.ID, bzzKey) pskad := kademlia(ctx.Config.ID, bzzKey)
b.Store(simulation.BucketKeyKademlia, pskad)
ps, err := NewPss(pskad, pssp) ps, err := NewPss(pskad, pssp)
if err != nil { if err != nil {
return nil, nil, err return nil, nil, err
@ -442,7 +438,7 @@ func newProxServices(tstdata *testData, allowRaw bool, handlerContextFuncs map[T
// register the handlers we've been passed // register the handlers we've been passed
var deregisters []func() var deregisters []func()
for tpc, hndlrFunc := range handlerContextFuncs { for tpc, hndlrFunc := range handlerContextFuncs {
deregisters = append(deregisters, ps.Register(&tpc, hndlrFunc(tstdata, ctx.Config))) deregisters = append(deregisters, ps.Register(&tpc, hndlrFunc(td, ctx.Config)))
} }
// if handshake mode is set, add the controller // if handshake mode is set, add the controller
@ -459,8 +455,6 @@ func newProxServices(tstdata *testData, allowRaw bool, handlerContextFuncs map[T
Public: false, Public: false,
}) })
b.Store(simulation.BucketKeyKademlia, pskad)
// return Pss and cleanups // return Pss and cleanups
return ps, func() { return ps, func() {
// run the handler deregister functions in reverse order // run the handler deregister functions in reverse order

@ -1364,7 +1364,7 @@ func TestNetwork(t *testing.T) {
} }
// params in run name: // params in run name:
// nodes/msgs/addrbytes/adaptertype // nodes/recipientAddresses/addrbytes/adaptertype
// if adaptertype is exec uses execadapter, simadapter otherwise // if adaptertype is exec uses execadapter, simadapter otherwise
func TestNetwork2000(t *testing.T) { func TestNetwork2000(t *testing.T) {
if !*longrunning { if !*longrunning {