refactor metrics, add metric debug

This commit is contained in:
Felipe Andrade 2023-07-14 14:08:02 -07:00
parent b4f0ede95e
commit 668228c5c3
9 changed files with 106 additions and 47 deletions

@ -27,6 +27,7 @@ type SignerServiceConfig struct {
// MetricsConfig holds the TOML settings for the metrics subsystem.
type MetricsConfig struct {
	// Enabled controls whether the service starts the metrics HTTP server.
	Enabled bool `toml:"enabled"`
	// Debug is copied into metrics.Debug at service start; when set, every
	// recorded metric is also written to the debug log.
	Debug bool `toml:"debug"`
	// Host and Port are joined as "host:port" to form the listen address
	// of the metrics server.
	Host string `toml:"host"`
	Port int    `toml:"port"`
}
@ -54,9 +55,8 @@ type WalletConfig struct {
}
type ProviderConfig struct {
Disabled bool `toml:"disabled"`
Network string `toml:"network"`
URL string `toml:"url"`
Network string `toml:"network"`
URL string `toml:"url"`
ReadOnly bool `toml:"read_only"`
ReadInterval TOMLDuration `toml:"read_interval"`

@ -33,7 +33,7 @@ func (i *InstrumentedEthClient) TransactionByHash(ctx context.Context, hash comm
start := time.Now()
tx, isPending, err := i.c.TransactionByHash(ctx, hash)
if err != nil {
if !i.IgnorableErrors(err) {
if !i.ignorableErrors(err) {
metrics.RecordError(i.providerName, "ethclient.TransactionByHash")
}
return nil, false, err
@ -57,7 +57,7 @@ func (i *InstrumentedEthClient) TransactionReceipt(ctx context.Context, txHash c
start := time.Now()
receipt, err := i.c.TransactionReceipt(ctx, txHash)
if err != nil {
if !i.IgnorableErrors(err) {
if !i.ignorableErrors(err) {
metrics.RecordError(i.providerName, "ethclient.TransactionReceipt")
}
return nil, err
@ -70,7 +70,7 @@ func (i *InstrumentedEthClient) SendTransaction(ctx context.Context, tx *types.T
start := time.Now()
err := i.c.SendTransaction(ctx, tx)
if err != nil {
if !i.IgnorableErrors(err) {
if !i.ignorableErrors(err) {
metrics.RecordError(i.providerName, "ethclient.SendTransaction")
}
return err
@ -79,7 +79,7 @@ func (i *InstrumentedEthClient) SendTransaction(ctx context.Context, tx *types.T
return err
}
func (i *InstrumentedEthClient) IgnorableErrors(err error) bool {
func (i *InstrumentedEthClient) ignorableErrors(err error) bool {
msg := err.Error()
// we don't use errors.Is because the eth client actually uses errors.New,
// therefore creating an incomparable instance :(

@ -3,6 +3,7 @@ package metrics
import (
"time"
"github.com/ethereum/go-ethereum/log"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
)
@ -12,6 +13,8 @@ const (
)
var (
Debug bool
errorsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: MetricsNamespace,
Name: "errors_total",
@ -75,29 +78,57 @@ var (
)
// RecordError increments the errors_total counter for the given provider and
// error label.
//
// When Debug is set, the increment is also written to the debug log before
// the counter is touched.
func RecordError(provider string, errorLabel string) {
	if Debug {
		log.Debug(
			"metric inc",
			"m", "errors_total",
			"provider", provider,
			"error", errorLabel,
		)
	}

	counter := errorsTotal.WithLabelValues(provider, errorLabel)
	counter.Inc()
}
// RecordRPCLatency sets the rpc_latency metric for the given provider, client
// and method to latency expressed in whole milliseconds.
//
// When Debug is set, the update is also written to the debug log first.
func RecordRPCLatency(provider string, client string, method string, latency time.Duration) {
	if Debug {
		log.Debug(
			"metric set",
			"m", "rpc_latency",
			"provider", provider,
			"client", client,
			"method", method,
			"latency", latency,
		)
	}

	// Duration.Milliseconds truncates to an integer number of milliseconds.
	millis := float64(latency.Milliseconds())
	rpcLatency.WithLabelValues(provider, client, method).Set(millis)
}
// RecordRoundTripLatency sets the roundtrip_latency metric for the given
// provider to latency expressed in whole milliseconds.
//
// When Debug is set, the update is also written to the debug log first.
func RecordRoundTripLatency(provider string, latency time.Duration) {
	if Debug {
		log.Debug(
			"metric set",
			"m", "roundtrip_latency",
			"provider", provider,
			"latency", latency,
		)
	}

	// Duration.Milliseconds truncates to an integer number of milliseconds.
	millis := float64(latency.Milliseconds())
	roundTripLatency.WithLabelValues(provider).Set(millis)
}
func RecordGasUsed(provider string, val uint64) {
gasUsed.WithLabelValues(provider).Set(float64(val))
if Debug {
log.Debug("metric add", "m", "gas_used",
"provider", provider, "val", val)
}
gasUsed.WithLabelValues(provider).Add(float64(val))
}
func RecordFirstSeenLatency(provider_source string, provider_seen string, latency time.Duration) {
firstSeenLatency.WithLabelValues(provider_source, provider_seen).Set(float64(latency.Milliseconds()))
func RecordFirstSeenLatency(providerSource string, providerSeen string, latency time.Duration) {
if Debug {
log.Debug("metric set", "m", "first_seen_latency",
"provider_source", providerSource, "provider_seen", providerSeen, "latency", latency)
}
firstSeenLatency.WithLabelValues(providerSource, providerSeen).Set(float64(latency.Milliseconds()))
}
func RecordProviderToProviderLatency(provider_source string, provider_seen string, latency time.Duration) {
firstSeenLatency.WithLabelValues(provider_source, provider_seen).Set(float64(latency.Milliseconds()))
func RecordProviderToProviderLatency(providerSource string, providerSeen string, latency time.Duration) {
if Debug {
log.Debug("metric set", "m", "provider_to_provider_latency",
"provider_source", providerSource, "provider_seen", providerSeen, "latency", latency)
}
providerToProviderLatency.WithLabelValues(providerSource, providerSeen).Set(float64(latency.Milliseconds()))
}
// RecordTransactionsInFlight sets the transactions_inflight metric for the
// given network to count.
//
// When Debug is set, the update is also written to the debug log first.
func RecordTransactionsInFlight(network string, count int) {
	if Debug {
		log.Debug(
			"metric set",
			"m", "transactions_inflight",
			"network", network,
			"count", count,
		)
	}

	inFlight := float64(count)
	networkTransactionsInFlight.WithLabelValues(network).Set(inFlight)
}

@ -55,19 +55,19 @@ func (p *Provider) Heartbeat(ctx context.Context) {
// mark transaction as seen by this provider
st.M.Lock()
latency := time.Since(st.SentAt)
if st.FirstSeen.IsZero() {
st.FirstSeen = time.Now()
firstSeenLatency := time.Since(st.SentAt)
metrics.RecordFirstSeenLatency(st.ProviderSentTo, p.name, time.Since(st.SentAt))
metrics.RecordFirstSeenLatency(st.ProviderSentTo, p.name, latency)
log.Info("transaction first seen",
"hash", hash,
"firstSeenLatency", firstSeenLatency,
"provider_source", st.ProviderSentTo,
"provider_seen", p.name)
"firstSeenLatency", latency,
"providerSource", st.ProviderSentTo,
"providerSeen", p.name)
}
if _, exist := st.SeenBy[p.name]; !exist {
st.SeenBy[p.name] = time.Now()
metrics.RecordProviderToProviderLatency(st.ProviderSentTo, p.name, time.Since(st.SentAt))
metrics.RecordProviderToProviderLatency(st.ProviderSentTo, p.name, latency)
}
st.M.Unlock()

@ -2,7 +2,6 @@ package provider
import (
"context"
"net/http"
"op-ufm/pkg/config"
"time"
)
@ -13,9 +12,8 @@ type Provider struct {
signerConfig *config.SignerServiceConfig
walletConfig *config.WalletConfig
txPool *NetworkTransactionPool
cancelFunc context.CancelFunc
client *http.Client
cancelFunc context.CancelFunc
}
func New(name string, cfg *config.ProviderConfig,
@ -28,8 +26,6 @@ func New(name string, cfg *config.ProviderConfig,
signerConfig: signerConfig,
walletConfig: walletConfig,
txPool: txPool,
client: http.DefaultClient,
}
return p
}
@ -37,6 +33,7 @@ func New(name string, cfg *config.ProviderConfig,
func (p *Provider) Start(ctx context.Context) {
providerCtx, cancelFunc := context.WithCancel(ctx)
p.cancelFunc = cancelFunc
schedule(providerCtx, time.Duration(p.config.ReadInterval), p.Heartbeat)
if !p.config.ReadOnly {
schedule(providerCtx, time.Duration(p.config.SendInterval), p.RoundTrip)

@ -20,7 +20,7 @@ import (
// RoundTrip sends a new transaction to measure round trip latency
func (p *Provider) RoundTrip(ctx context.Context) {
log.Debug("roundtrip", "provider", p.name)
log.Debug("roundTripLatency", "provider", p.name)
client, err := iclients.Dial(p.name, p.config.URL)
if err != nil {
@ -36,7 +36,10 @@ func (p *Provider) RoundTrip(ctx context.Context) {
txHash := common.Hash{}
attempt := 0
startedAt := time.Now()
// used for timeout
firstAttemptAt := time.Now()
// used for actual round trip time (disregard retry time)
roundTripStartedAt := time.Now()
for {
tx := p.createTx(nonce)
txHash = tx.Hash()
@ -49,11 +52,12 @@ func (p *Provider) RoundTrip(ctx context.Context) {
txHash = signedTx.Hash()
roundTripStartedAt = time.Now()
err = client.SendTransaction(ctx, signedTx)
if err != nil {
if err.Error() == txpool.ErrAlreadyKnown.Error() || err.Error() == core.ErrNonceTooLow.Error() {
if time.Since(startedAt) >= time.Duration(p.config.SendTransactionRetryTimeout) {
log.Error("send transaction timed out (known already)", "provider", p.name, "hash", txHash.Hex(), "elapsed", time.Since(startedAt), "attempt", attempt, "nonce", nonce)
if time.Since(firstAttemptAt) >= time.Duration(p.config.SendTransactionRetryTimeout) {
log.Error("send transaction timed out (known already)", "provider", p.name, "hash", txHash.Hex(), "elapsed", time.Since(firstAttemptAt), "attempt", attempt, "nonce", nonce)
metrics.RecordError(p.name, "ethclient.SendTransaction.nonce")
return
}
@ -62,7 +66,7 @@ func (p *Provider) RoundTrip(ctx context.Context) {
nonce++
attempt++
if attempt%10 == 0 {
log.Debug("retrying send transaction...", "provider", p.name, "attempt", attempt, "nonce", nonce, "elapsed", time.Since(startedAt))
log.Debug("retrying send transaction...", "provider", p.name, "attempt", attempt, "nonce", nonce, "elapsed", time.Since(firstAttemptAt))
}
} else {
log.Error("cant send transaction", "provider", p.name, "err", err)
@ -104,13 +108,14 @@ func (p *Provider) RoundTrip(ctx context.Context) {
}
attempt++
}
roundtrip := time.Since(sentAt)
metrics.RecordRoundTripLatency(p.name, roundtrip)
roundTripLatency := time.Since(roundTripStartedAt)
metrics.RecordRoundTripLatency(p.name, roundTripLatency)
metrics.RecordGasUsed(p.name, receipt.GasUsed)
log.Info("got transaction receipt", "hash", txHash.Hex(),
"roundtrip", roundtrip,
"roundTripLatency", roundTripLatency,
"provider", p.name,
"blockNumber", receipt.BlockNumber,
"blockHash", receipt.BlockHash,

@ -9,12 +9,12 @@ import (
"github.com/rs/cors"
)
// HealthzServer serves the /healthz liveness endpoint over HTTP.
type HealthzServer struct {
	// ctx is the context handed to server.Shutdown.
	// NOTE(review): presumably stored by Start — confirm.
	ctx context.Context
	// server is the HTTP server that Start runs and Shutdown stops.
	server *http.Server
}
func (h *Healthz) Start(ctx context.Context, host string, port int) error {
func (h *HealthzServer) Start(ctx context.Context, host string, port int) error {
hdlr := mux.NewRouter()
hdlr.HandleFunc("/healthz", h.Handle).Methods("GET")
addr := fmt.Sprintf("%s:%d", host, port)
@ -30,10 +30,10 @@ func (h *Healthz) Start(ctx context.Context, host string, port int) error {
return h.server.ListenAndServe()
}
func (h *Healthz) Shutdown() error {
func (h *HealthzServer) Shutdown() error {
return h.server.Shutdown(h.ctx)
}
func (h *Healthz) Handle(w http.ResponseWriter, r *http.Request) {
func (h *HealthzServer) Handle(w http.ResponseWriter, r *http.Request) {
w.Write([]byte("OK"))
}

@ -0,0 +1,27 @@
package service
import (
	"context"
	"errors"
	"net/http"

	"github.com/prometheus/client_golang/prometheus/promhttp"
)
// MetricsServer exposes the Prometheus metrics handler over HTTP.
type MetricsServer struct {
	// ctx is the context given to Start; Shutdown passes it to the server.
	ctx context.Context
	// server is the HTTP server created by Start and stopped by Shutdown.
	server *http.Server
}
// Start creates an HTTP server that serves the Prometheus handler on addr,
// remembers ctx for a later Shutdown, and blocks serving requests.
//
// It returns the error that stopped the server. http.ErrServerClosed, which
// ListenAndServe returns after a Shutdown or Close, is the normal termination
// path and is reported as nil so callers do not log it as a startup failure.
func (m *MetricsServer) Start(ctx context.Context, addr string) error {
	m.ctx = ctx
	m.server = &http.Server{
		Addr:    addr,
		Handler: promhttp.Handler(),
	}

	if err := m.server.ListenAndServe(); !errors.Is(err, http.ErrServerClosed) {
		return err
	}
	return nil
}
// Shutdown stops the metrics HTTP server using the context stored by Start
// and returns the error reported by http.Server.Shutdown.
//
// It returns nil when Start has not created the server yet, so calling
// Shutdown first is a no-op instead of a nil-pointer panic.
func (m *MetricsServer) Shutdown() error {
	if m.server == nil {
		return nil
	}
	return m.server.Shutdown(m.ctx)
}

@ -3,24 +3,25 @@ package service
import (
"context"
"fmt"
"net/http"
"op-ufm/pkg/config"
"op-ufm/pkg/metrics"
"op-ufm/pkg/provider"
"github.com/ethereum/go-ethereum/log"
"github.com/prometheus/client_golang/prometheus/promhttp"
)
// Service ties together the loaded configuration, the auxiliary HTTP servers
// and the configured providers.
type Service struct {
	// Config is the configuration the service was created with.
	Config *config.Config
	// Healthz is the health check HTTP server.
	Healthz *HealthzServer
	// Metrics is the Prometheus metrics HTTP server.
	Metrics *MetricsServer
	// Providers holds the running providers keyed by provider name.
	Providers map[string]*provider.Provider
}
func New(cfg *config.Config) *Service {
s := &Service{
Config: cfg,
Healthz: &Healthz{},
Healthz: &HealthzServer{},
Metrics: &MetricsServer{},
Providers: make(map[string]*provider.Provider, len(cfg.Providers)),
}
return s
@ -38,11 +39,12 @@ func (s *Service) Start(ctx context.Context) {
}()
}
metrics.Debug = s.Config.Metrics.Debug
if s.Config.Metrics.Enabled {
addr := fmt.Sprintf("%s:%d", s.Config.Metrics.Host, s.Config.Metrics.Port)
log.Info("starting metrics server", "addr", addr)
go func() {
if err := http.ListenAndServe(addr, promhttp.Handler()); err != nil {
if err := s.Metrics.Start(ctx, addr); err != nil {
log.Error("error starting metrics server", "err", err)
}
}()
@ -51,9 +53,6 @@ func (s *Service) Start(ctx context.Context) {
// map each network to its providers
networks := make(map[string][]string)
for name, providerConfig := range s.Config.Providers {
if providerConfig.Disabled {
continue
}
networks[providerConfig.Network] = append(networks[providerConfig.Network], name)
}
@ -70,10 +69,6 @@ func (s *Service) Start(ctx context.Context) {
}
for name, providerConfig := range s.Config.Providers {
if providerConfig.Disabled {
log.Info("provider is disabled", "provider", name)
continue
}
s.Providers[name] = provider.New(name,
providerConfig,
&s.Config.Signer,
@ -92,6 +87,10 @@ func (s *Service) Shutdown() {
s.Healthz.Shutdown()
log.Info("healthz stopped")
}
if s.Config.Metrics.Enabled {
s.Metrics.Shutdown()
log.Info("metrics stopped")
}
for name, provider := range s.Providers {
provider.Shutdown()
log.Info("provider stopped", "provider", name)