bsc/core/tx_list.go

// Copyright 2016 The go-ethereum Authors
// This file is part of the go-ethereum library.
//
// The go-ethereum library is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// The go-ethereum library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
package core

import (
"container/heap"
"math"
"math/big"
"sort"
"sync"
"github.com/ethereum/go-ethereum/common"
"github.com/ethereum/go-ethereum/core/types"
)
var txSortedMapPool = sync.Pool{
New: func() interface{} {
return make(types.Transactions, 0, 10)
},
}
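
// Illustrative note (not part of the upstream file): the pool recycles the
// nonce-sorted cache slices built by flatten below. Whenever a map is mutated,
// its now-stale cache is handed back through txSortedMapPool.Put and later
// reclaimed (and reset to zero length) by another flatten call, trading a
// sync.Pool round trip for fewer large slice allocations.
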
// nonceHeap is a heap.Interface implementation over 64-bit unsigned integers for
// retrieving sorted transactions from the possibly gapped future queue.
type nonceHeap []uint64
func (h nonceHeap) Len() int { return len(h) }
func (h nonceHeap) Less(i, j int) bool { return h[i] < h[j] }
func (h nonceHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] }
func (h *nonceHeap) Push(x interface{}) {
*h = append(*h, x.(uint64))
}
func (h *nonceHeap) Pop() interface{} {
old := *h
n := len(old)
x := old[n-1]
*h = old[0 : n-1]
return x
}
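
// exampleNonceHeap is an illustrative sketch (not part of the upstream file):
// nonceHeap only implements heap.Interface, so callers mutate it through
// heap.Push and heap.Pop, which keep the smallest nonce at index 0.
func exampleNonceHeap() uint64 {
	h := new(nonceHeap)
	heap.Push(h, uint64(7))
	heap.Push(h, uint64(3))
	heap.Push(h, uint64(5))
	return heap.Pop(h).(uint64) // 3, the lowest queued nonce
}
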
// txSortedMap is a nonce->transaction hash map with a heap-based index to allow
// iterating over the contents in a nonce-incrementing way.
type txSortedMap struct {
items map[uint64]*types.Transaction // Hash map storing the transaction data
index *nonceHeap // Heap of nonces of all the stored transactions (non-strict mode)
cache types.Transactions // Cache of the transactions already sorted
}
// newTxSortedMap creates a new nonce-sorted transaction map.
func newTxSortedMap() *txSortedMap {
return &txSortedMap{
items: make(map[uint64]*types.Transaction),
index: new(nonceHeap),
}
}
// Get retrieves the current transactions associated with the given nonce.
func (m *txSortedMap) Get(nonce uint64) *types.Transaction {
return m.items[nonce]
}
// Put inserts a new transaction into the map, also updating the map's nonce
// index. If a transaction already exists with the same nonce, it's overwritten.
func (m *txSortedMap) Put(tx *types.Transaction) {
nonce := tx.Nonce()
if m.items[nonce] == nil {
heap.Push(m.index, nonce)
}
if m.cache != nil {
txSortedMapPool.Put(m.cache)
}
m.items[nonce], m.cache = tx, nil
}
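
// exampleSortedMapPut is an illustrative sketch (not part of the upstream
// file): txA and txB are assumed to carry the same nonce, so the second Put
// overwrites the first while the heap index keeps a single entry.
func exampleSortedMapPut(txA, txB *types.Transaction) *types.Transaction {
	m := newTxSortedMap()
	m.Put(txA)
	m.Put(txB) // replaces txA; m.Len() stays 1
	return m.Get(txA.Nonce()) // returns txB
}
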
// Forward removes all transactions from the map with a nonce lower than the
// provided threshold. Every removed transaction is returned for any post-removal
// maintenance.
func (m *txSortedMap) Forward(threshold uint64) types.Transactions {
var removed types.Transactions
// Pop off heap items until the threshold is reached
for m.index.Len() > 0 && (*m.index)[0] < threshold {
nonce := heap.Pop(m.index).(uint64)
removed = append(removed, m.items[nonce])
delete(m.items, nonce)
}
// If we had a cached order, shift the front
if m.cache != nil {
m.cache = m.cache[len(removed):]
}
return removed
}
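
// Illustrative behaviour of Forward (not part of the upstream file): with
// transactions stored at nonces {1, 2, 5}, Forward(3) pops and returns the
// entries at nonces 1 and 2, leaving the gapped nonce 5 queued; a later
// Forward(6) would then drain the map completely.
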
// Filter iterates over the list of transactions and removes all of them for which
// the specified function evaluates to true.
// Filter, as opposed to 'filter', re-initialises the heap after the operation is done.
// If you want to do several consecutive filterings, it's therefore better to first
// do a .filter(func1) followed by .Filter(func2) or reheap().
func (m *txSortedMap) Filter(filter func(*types.Transaction) bool) types.Transactions {
removed := m.filter(filter)
// If transactions were removed, the heap and cache are ruined
if len(removed) > 0 {
m.reheap()
}
return removed
}
func (m *txSortedMap) reheap() {
*m.index = make([]uint64, 0, len(m.items))
for nonce := range m.items {
*m.index = append(*m.index, nonce)
}
heap.Init(m.index)
m.cache = nil
}
// filter is identical to Filter, but **does not** regenerate the heap. This method
// should only be used if followed immediately by a call to Filter or reheap().
func (m *txSortedMap) filter(filter func(*types.Transaction) bool) types.Transactions {
var removed types.Transactions
// Collect all the transactions to filter out
for nonce, tx := range m.items {
if filter(tx) {
removed = append(removed, tx)
delete(m.items, nonce)
}
}
if len(removed) > 0 {
if m.cache != nil {
txSortedMapPool.Put(m.cache)
m.cache = nil
}
}
return removed
}
// Cap places a hard limit on the number of items, returning all transactions
// exceeding that limit.
func (m *txSortedMap) Cap(threshold int) types.Transactions {
// Short circuit if the number of items is under the limit
if len(m.items) <= threshold {
return nil
}
// Otherwise gather and drop the highest nonce'd transactions
var drops types.Transactions
sort.Sort(*m.index)
for size := len(m.items); size > threshold; size-- {
drops = append(drops, m.items[(*m.index)[size-1]])
delete(m.items, (*m.index)[size-1])
}
*m.index = (*m.index)[:threshold]
heap.Init(m.index)
// If we had a cache, shift the back
if m.cache != nil {
m.cache = m.cache[:len(m.cache)-len(drops)]
}
return drops
}
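
// Illustrative behaviour of Cap (not part of the upstream file): Cap(2) on a
// map holding nonces {1, 2, 7, 9} drops and returns the transactions at nonces
// 9 and 7 (highest first), keeping {1, 2}. A cached nonce-sorted slice survives
// this operation, since only its tail is trimmed.
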
// Remove deletes a transaction from the maintained map, returning whether the
// transaction was found.
func (m *txSortedMap) Remove(nonce uint64) bool {
// Short circuit if no transaction is present
_, ok := m.items[nonce]
if !ok {
return false
}
// Otherwise delete the transaction and fix the heap index
for i := 0; i < m.index.Len(); i++ {
if (*m.index)[i] == nonce {
heap.Remove(m.index, i)
break
}
}
delete(m.items, nonce)
if m.cache != nil {
txSortedMapPool.Put(m.cache)
m.cache = nil
}
return true
}
// Ready retrieves a sequentially increasing list of transactions starting at the
// provided nonce that are ready for processing. The returned transactions will be
// removed from the list.
//
// Note, all transactions with nonces lower than start will also be returned to
// prevent getting into an invalid state. This is not something that should ever
// happen, but better to be self-correcting than failing!
func (m *txSortedMap) Ready(start uint64) types.Transactions {
// Short circuit if no transactions are available
if m.index.Len() == 0 || (*m.index)[0] > start {
return nil
}
// Otherwise start accumulating incremental transactions
var ready types.Transactions
for next := (*m.index)[0]; m.index.Len() > 0 && (*m.index)[0] == next; next++ {
ready = append(ready, m.items[next])
delete(m.items, next)
heap.Pop(m.index)
}
if m.cache != nil {
txSortedMapPool.Put(m.cache)
m.cache = nil
}
return ready
}
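
// Illustrative behaviour of Ready (not part of the upstream file): with
// nonces {2, 3, 5} stored, Ready(4) returns the transactions at 2 and 3 (the
// consecutive run starting at the lowest nonce), leaving the gapped nonce 5
// queued; Ready(1) returns nil because even the lowest stored nonce exceeds
// the requested start.
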
// Len returns the length of the transaction map.
func (m *txSortedMap) Len() int {
return len(m.items)
}
func (m *txSortedMap) flatten() types.Transactions {
// If the sorting was not cached yet, create and cache it
if m.cache == nil {
cache := txSortedMapPool.Get()
if cache != nil {
m.cache = cache.(types.Transactions)
m.cache = m.cache[:0]
} else {
m.cache = make(types.Transactions, 0, len(m.items))
}
for _, tx := range m.items {
m.cache = append(m.cache, tx)
}
sort.Sort(types.TxByNonce(m.cache))
}
return m.cache
}
// Flatten creates a nonce-sorted slice of transactions based on the loosely
// sorted internal representation. The result of the sorting is cached in case
// it's requested again before any modifications are made to the contents.
func (m *txSortedMap) Flatten() types.Transactions {
// Copy the cache to prevent accidental modifications
cache := m.flatten()
txs := make(types.Transactions, len(cache))
copy(txs, cache)
return txs
}
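
// Illustrative note (not part of the upstream file): Flatten hands out a
// private copy, so callers may reorder or truncate the result without
// corrupting the cache shared with LastElement and later Flatten calls.
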
// LastElement returns the last element of a flattened list, i.e. the
// transaction with the highest nonce.
func (m *txSortedMap) LastElement() *types.Transaction {
cache := m.flatten()
return cache[len(cache)-1]
}
// txList is a "list" of transactions belonging to an account, sorted by account
// nonce. The same type can be used both for storing contiguous transactions for
// the executable/pending queue; and for storing gapped transactions for the non-
// executable/future queue, with minor behavioral changes.
type txList struct {
strict bool // Whether nonces are strictly continuous or not
txs *txSortedMap // Heap indexed sorted hash map of the transactions
costcap *big.Int // Price of the highest costing transaction (reset only if exceeds balance)
gascap uint64 // Gas limit of the highest spending transaction (reset only if exceeds block limit)
}
// newTxList creates a new transaction list for maintaining nonce-indexable fast,
// gapped, sortable transaction lists.
func newTxList(strict bool) *txList {
return &txList{
strict: strict,
txs: newTxSortedMap(),
costcap: new(big.Int),
}
}
// Overlaps returns whether the transaction specified has the same nonce as one
// already contained within the list.
func (l *txList) Overlaps(tx *types.Transaction) bool {
return l.txs.Get(tx.Nonce()) != nil
}
// Add tries to insert a new transaction into the list, returning whether the
// transaction was accepted, and if yes, any previous transaction it replaced.
//
// If the new transaction is accepted into the list, the list's cost and gas
// thresholds are also potentially updated.
func (l *txList) Add(tx *types.Transaction, priceBump uint64) (bool, *types.Transaction) {
// If there's an older better transaction, abort
old := l.txs.Get(tx.Nonce())
if old != nil {
// threshold = oldGP * (100 + priceBump) / 100
a := big.NewInt(100 + int64(priceBump))
a = a.Mul(a, old.GasPrice())
b := big.NewInt(100)
threshold := a.Div(a, b)
// Have to ensure that the new gas price is higher than the old gas
// price as well as checking the percentage threshold to ensure that
// this is accurate for low (Wei-level) gas price replacements
if old.GasPriceCmp(tx) >= 0 || tx.GasPriceIntCmp(threshold) < 0 {
return false, nil
}
}
// Otherwise overwrite the old transaction with the current one
l.txs.Put(tx)
if cost := tx.Cost(); l.costcap.Cmp(cost) < 0 {
l.costcap = cost
}
if gas := tx.Gas(); l.gascap < gas {
l.gascap = gas
}
return true, old
}
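
// A worked example of the bump rule above (illustrative, not part of the
// upstream file): with priceBump = 10 and an old gas price of 100 wei, the
// threshold is 100 * (100+10) / 100 = 110 wei. A replacement at 105 wei is
// rejected even though it outbids the old price, while one at 110 wei or more
// is accepted and the displaced transaction is handed back to the caller.
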
// Forward removes all transactions from the list with a nonce lower than the
// provided threshold. Every removed transaction is returned for any post-removal
// maintenance.
func (l *txList) Forward(threshold uint64) types.Transactions {
return l.txs.Forward(threshold)
}
// Filter removes all transactions from the list with a cost or gas limit higher
// than the provided thresholds. Every removed transaction is returned for any
// post-removal maintenance. Strict-mode invalidated transactions are also
// returned.
//
// This method uses the cached costcap and gascap to quickly decide if there's even
// a point in calculating all the costs or if the balance covers all. If the threshold
// is lower than the cost/gas caps, the caps will be reset to a new high after removing
// the newly invalidated transactions.
func (l *txList) Filter(costLimit *big.Int, gasLimit uint64) (types.Transactions, types.Transactions) {
// If all transactions are below the threshold, short circuit
if l.costcap.Cmp(costLimit) <= 0 && l.gascap <= gasLimit {
return nil, nil
}
l.costcap = new(big.Int).Set(costLimit) // Lower the caps to the thresholds
l.gascap = gasLimit
// Filter out all the transactions above the account's funds
removed := l.txs.Filter(func(tx *types.Transaction) bool {
return tx.Gas() > gasLimit || tx.Cost().Cmp(costLimit) > 0
})
if len(removed) == 0 {
return nil, nil
}
var invalids types.Transactions
// If the list was strict, filter anything above the lowest nonce
if l.strict {
lowest := uint64(math.MaxUint64)
for _, tx := range removed {
if nonce := tx.Nonce(); lowest > nonce {
lowest = nonce
}
}
invalids = l.txs.filter(func(tx *types.Transaction) bool { return tx.Nonce() > lowest })
}
l.txs.reheap()
return removed, invalids
}
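
// Illustrative strict-mode behaviour (not part of the upstream file): if the
// balance drops so that only the nonce-3 transaction becomes unpayable, the
// still-affordable nonce-4 and nonce-5 transactions are returned as invalids
// too, since the gap at nonce 3 makes them non-executable.
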
// Cap places a hard limit on the number of items, returning all transactions
// exceeding that limit.
func (l *txList) Cap(threshold int) types.Transactions {
return l.txs.Cap(threshold)
}
// Remove deletes a transaction from the maintained list, returning whether the
// transaction was found, and also returning any transaction invalidated due to
// the deletion (strict mode only).
func (l *txList) Remove(tx *types.Transaction) (bool, types.Transactions) {
// Remove the transaction from the set
nonce := tx.Nonce()
if removed := l.txs.Remove(nonce); !removed {
return false, nil
}
// In strict mode, filter out non-executable transactions
if l.strict {
return true, l.txs.Filter(func(tx *types.Transaction) bool { return tx.Nonce() > nonce })
}
return true, nil
}
// Ready retrieves a sequentially increasing list of transactions starting at the
// provided nonce that are ready for processing. The returned transactions will be
// removed from the list.
//
// Note, all transactions with nonces lower than start will also be returned to
// prevent getting into an invalid state. This is not something that should ever
// happen, but better to be self-correcting than failing!
func (l *txList) Ready(start uint64) types.Transactions {
return l.txs.Ready(start)
}
// Len returns the length of the transaction list.
func (l *txList) Len() int {
return len(l.txs.items)
}
// Empty returns whether the list of transactions is empty or not.
func (l *txList) Empty() bool {
return l.Len() == 0
}
// Flatten creates a nonce-sorted slice of transactions based on the loosely
// sorted internal representation. The result of the sorting is cached in case
// it's requested again before any modifications are made to the contents.
func (l *txList) Flatten() types.Transactions {
return l.txs.Flatten()
}
// LastElement returns the last element of a flattened list, i.e. the
// transaction with the highest nonce.
func (l *txList) LastElement() *types.Transaction {
return l.txs.LastElement()
}
// priceHeap is a heap.Interface implementation over transactions for retrieving
// price-sorted transactions to discard when the pool fills up.
type priceHeap []*types.Transaction
func (h priceHeap) Len() int { return len(h) }
func (h priceHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] }
func (h priceHeap) Less(i, j int) bool {
// Sort primarily by price, returning the cheaper one
switch h[i].GasPriceCmp(h[j]) {
case -1:
return true
case 1:
return false
}
// If the prices match, stabilize via nonces (high nonce is worse)
return h[i].Nonce() > h[j].Nonce()
}
func (h *priceHeap) Push(x interface{}) {
*h = append(*h, x.(*types.Transaction))
}
func (h *priceHeap) Pop() interface{} {
old := *h
n := len(old)
x := old[n-1]
old[n-1] = nil
*h = old[0 : n-1]
return x
}
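
// exampleEvictionOrder is an illustrative sketch (not part of the upstream
// file): among equally priced transactions the higher-nonce one compares as
// "less" and is therefore popped (and evicted) first, being the least likely
// to ever become executable.
func exampleEvictionOrder(equalPriced []*types.Transaction) *types.Transaction {
	h := make(priceHeap, 0, len(equalPriced))
	h = append(h, equalPriced...)
	heap.Init(&h)
	return heap.Pop(&h).(*types.Transaction) // cheapest price, highest nonce
}
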
// txPricedList is a price-sorted heap to allow operating on transaction pool
// contents in a price-incrementing way. It's built upon all the transactions
// in the txpool but is only interested in the remote part, meaning only remote
// transactions will be considered for tracking, sorting, eviction, etc.
type txPricedList struct {
all *txLookup // Pointer to the map of all transactions
remotes *priceHeap // Heap of prices of all the stored **remote** transactions
stales int // Number of stale price points (re-heap trigger)
}
// newTxPricedList creates a new price-sorted transaction heap.
func newTxPricedList(all *txLookup) *txPricedList {
return &txPricedList{
all: all,
remotes: new(priceHeap),
}
}
// Put inserts a new transaction into the heap.
func (l *txPricedList) Put(tx *types.Transaction, local bool) {
if local {
return
}
heap.Push(l.remotes, tx)
}
// Removed notifies the priced transaction list that an old transaction dropped
// from the pool. The list will just keep a counter of stale objects and update
// the heap if a large enough ratio of transactions go stale.
func (l *txPricedList) Removed(count int) {
// Bump the stale counter, but exit if still too low (< 25%)
l.stales += count
if l.stales <= len(*l.remotes)/4 {
return
}
// Seems we've reached a critical number of stale transactions, reheap
l.Reheap()
}
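
// A worked example of the reheap trigger (illustrative, not part of the
// upstream file): with 100 entries in the remotes heap and stales at 20,
// Removed(6) raises the counter to 26 > 100/4, so the heap is rebuilt and the
// stale entries are dropped; Removed(5) would only have reached 25 and left
// the cleanup for later.
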
// Cap finds all the transactions below the given price threshold, drops them
// from the priced list and returns them for further removal from the entire pool.
//
// Note: only remote transactions will be considered for eviction.
func (l *txPricedList) Cap(threshold *big.Int) types.Transactions {
drop := make(types.Transactions, 0, 128) // Remote underpriced transactions to drop
for len(*l.remotes) > 0 {
// Discard stale transactions if found during cleanup
cheapest := (*l.remotes)[0]
if l.all.GetRemote(cheapest.Hash()) == nil { // Removed or migrated
heap.Pop(l.remotes)
l.stales--
continue
}
// Stop the discards if we've reached the threshold
if cheapest.GasPriceIntCmp(threshold) >= 0 {
break
}
heap.Pop(l.remotes)
drop = append(drop, cheapest)
}
return drop
}
// Underpriced checks whether a transaction is cheaper than (or as cheap as) the
// lowest priced (remote) transaction currently being tracked.
func (l *txPricedList) Underpriced(tx *types.Transaction) bool {
// Discard stale price points if found at the heap start
for len(*l.remotes) > 0 {
head := []*types.Transaction(*l.remotes)[0]
if l.all.GetRemote(head.Hash()) == nil { // Removed or migrated
l.stales--
heap.Pop(l.remotes)
continue
}
break
}
// Check if the transaction is underpriced or not
if len(*l.remotes) == 0 {
return false // There is no remote transaction at all.
}
// If the remote transaction is even cheaper than the
// cheapest one tracked locally, reject it.
cheapest := []*types.Transaction(*l.remotes)[0]
return cheapest.GasPriceCmp(tx) >= 0
}
// Discard finds a number of the most underpriced transactions, removes them from the
// priced list and returns them for further removal from the entire pool.
//
// Note: local transactions won't be considered for eviction.
func (l *txPricedList) Discard(slots int, force bool) (types.Transactions, bool) {
drop := make(types.Transactions, 0, slots) // Remote underpriced transactions to drop
for len(*l.remotes) > 0 && slots > 0 {
// Discard stale transactions if found during cleanup
tx := heap.Pop(l.remotes).(*types.Transaction)
if l.all.GetRemote(tx.Hash()) == nil { // Removed or migrated
l.stales--
continue
}
// Non stale transaction found, discard it
drop = append(drop, tx)
slots -= numSlots(tx)
}
// If we still can't make enough room for the new transaction
if slots > 0 && !force {
for _, tx := range drop {
heap.Push(l.remotes, tx)
}
return nil, false
}
return drop, true
}
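
// Illustrative behaviour of Discard (not part of the upstream file): asked
// for 4 slots when the remote heap only covers 2, a non-forced call pushes
// the popped transactions back and reports failure, whereas force == true
// returns the partial batch anyway (used when making room is mandatory).
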
// Reheap forcibly rebuilds the heap based on the current remote transaction set.
func (l *txPricedList) Reheap() {
reheap := make(priceHeap, 0, l.all.RemoteCount())
l.stales, l.remotes = 0, &reheap
l.all.Range(func(hash common.Hash, tx *types.Transaction, local bool) bool {
*l.remotes = append(*l.remotes, tx)
return true
}, false, true) // Only iterate remotes
heap.Init(l.remotes)
}