From 88da33c7e4fef9cdc68f137be05f92c0acbae5c2 Mon Sep 17 00:00:00 2001 From: Bryan Stitt Date: Wed, 15 Feb 2023 12:33:43 -0800 Subject: [PATCH] i think it works --- web3_proxy/src/rpcs/blockchain.rs | 30 ++++++-- web3_proxy/src/rpcs/consensus.rs | 88 ++++++++++++++++-------- web3_proxy/src/rpcs/many.rs | 5 +- web3_proxy/src/rpcs/one.rs | 109 +++++++++++++++++++++--------- web3_proxy/src/rpcs/request.rs | 12 ++++ 5 files changed, 174 insertions(+), 70 deletions(-) diff --git a/web3_proxy/src/rpcs/blockchain.rs b/web3_proxy/src/rpcs/blockchain.rs index 9aa018a0..cd8957f5 100644 --- a/web3_proxy/src/rpcs/blockchain.rs +++ b/web3_proxy/src/rpcs/blockchain.rs @@ -415,6 +415,8 @@ impl Web3Rpcs { // TODO: what should we do if the block number of new_synced_connections is < old_synced_connections? wait? + let consensus_tier = new_synced_connections.tier; + let total_tiers = consensus_finder.len(); let backups_needed = new_synced_connections.backups_needed; let consensus_head_block = new_synced_connections.head_block.clone(); let num_consensus_rpcs = new_synced_connections.num_conns(); @@ -434,7 +436,9 @@ impl Web3Rpcs { match &old_consensus_head_connections.head_block { None => { debug!( - "first {}{}/{}/{} block={}, rpc={}", + "first {}/{} {}{}/{}/{} block={}, rpc={}", + consensus_tier, + total_tiers, backups_voted_str, num_consensus_rpcs, num_active_rpcs, @@ -469,7 +473,9 @@ impl Web3Rpcs { // no change in hash. no need to use head_block_sender // TODO: trace level if rpc is backup debug!( - "con {}{}/{}/{} con={} rpc={}@{}", + "con {}/{} {}{}/{}/{} con={} rpc={}@{}", + consensus_tier, + total_tiers, backups_voted_str, num_consensus_rpcs, num_active_rpcs, @@ -486,7 +492,9 @@ impl Web3Rpcs { } debug!( - "unc {}{}/{}/{} con_head={} old={} rpc={}@{}", + "unc {}/{} {}{}/{}/{} con_head={} old={} rpc={}@{}", + consensus_tier, + total_tiers, backups_voted_str, num_consensus_rpcs, num_active_rpcs, @@ -511,7 +519,9 @@ impl Web3Rpcs { // this is unlikely but possible // TODO: better log warn!( - "chain rolled back {}{}/{}/{} con={} old={} rpc={}@{}", + "chain rolled back {}/{} {}{}/{}/{} con={} old={} rpc={}@{}", + consensus_tier, + total_tiers, backups_voted_str, num_consensus_rpcs, num_active_rpcs, @@ -541,7 +551,9 @@ impl Web3Rpcs { } Ordering::Greater => { debug!( - "new {}{}/{}/{} con={} rpc={}@{}", + "new {}/{} {}{}/{}/{} con={} rpc={}@{}", + consensus_tier, + total_tiers, backups_voted_str, num_consensus_rpcs, num_active_rpcs, @@ -573,7 +585,9 @@ impl Web3Rpcs { if num_active_rpcs >= self.min_head_rpcs { // no consensus!!! error!( - "non {}{}/{}/{} rpc={}@{}", + "non {}/{} {}{}/{}/{} rpc={}@{}", + consensus_tier, + total_tiers, backups_voted_str, num_consensus_rpcs, num_active_rpcs, @@ -584,7 +598,9 @@ impl Web3Rpcs { } else { // no consensus, but we do not have enough rpcs connected yet to panic debug!( - "non {}{}/{}/{} rpc={}@{}", + "non {}/{} {}{}/{}/{} rpc={}@{}", + consensus_tier, + total_tiers, backups_voted_str, num_consensus_rpcs, num_active_rpcs, diff --git a/web3_proxy/src/rpcs/consensus.rs b/web3_proxy/src/rpcs/consensus.rs index 847892cf..62901b59 100644 --- a/web3_proxy/src/rpcs/consensus.rs +++ b/web3_proxy/src/rpcs/consensus.rs @@ -7,15 +7,18 @@ use anyhow::Context; use ethers::prelude::{H256, U64}; use hashbrown::{HashMap, HashSet}; use log::{debug, trace, warn}; +use moka::future::Cache; use serde::Serialize; use std::collections::BTreeMap; use std::fmt; use std::sync::Arc; +use tokio::time::Instant; /// A collection of Web3Rpcs that are on the same block. 
/// Serialize is so we can print it on our debug endpoint #[derive(Clone, Default, Serialize)] pub struct ConsensusWeb3Rpcs { + pub(super) tier: u64, pub(super) head_block: Option, // TODO: this should be able to serialize, but it isn't #[serde(skip_serializing)] @@ -74,22 +77,25 @@ impl Web3Rpcs { } } +type FirstSeenCache = Cache; + pub struct ConnectionsGroup { rpc_name_to_block: HashMap, // TODO: what if there are two blocks with the same number? highest_block: Option, -} - -impl Default for ConnectionsGroup { - fn default() -> Self { - Self { - rpc_name_to_block: Default::default(), - highest_block: Default::default(), - } - } + /// used to track rpc.head_latency. The same cache should be shared between all ConnectionsGroups + first_seen: FirstSeenCache, } impl ConnectionsGroup { + pub fn new(first_seen: FirstSeenCache) -> Self { + Self { + rpc_name_to_block: Default::default(), + highest_block: Default::default(), + first_seen, + } + } + pub fn len(&self) -> usize { self.rpc_name_to_block.len() } @@ -115,7 +121,17 @@ impl ConnectionsGroup { } } - fn insert(&mut self, rpc: &Web3Rpc, block: Web3ProxyBlock) -> Option { + async fn insert(&mut self, rpc: &Web3Rpc, block: Web3ProxyBlock) -> Option { + let first_seen = self + .first_seen + .get_with(*block.hash(), async move { Instant::now() }) + .await; + + // TODO: this should be 0 if we are first seen, but i think it will be slightly non-zero + rpc.head_latency + .write() + .record(first_seen.elapsed().as_secs_f64() * 1000.0); + // TODO: what about a reorg to the same height? if Some(block.number()) > self.highest_block.as_ref().map(|x| x.number()) { self.highest_block = Some(block.clone()); @@ -179,6 +195,7 @@ impl ConnectionsGroup { authorization: &Arc, web3_rpcs: &Web3Rpcs, min_consensus_block_num: Option, + tier: &u64, ) -> anyhow::Result { let mut maybe_head_block = match self.highest_block.clone() { None => return Err(anyhow::anyhow!("no blocks known")), @@ -191,13 +208,18 @@ impl ConnectionsGroup { if let Some(min_consensus_block_num) = min_consensus_block_num { maybe_head_block .number() + .saturating_add(1.into()) .saturating_sub(min_consensus_block_num) .as_u64() } else { - // TODO: get from app config? different chains probably should have different values. 10 is probably too much 10 }; + trace!( + "max_lag_consensus_to_highest: {}", + max_lag_consensus_to_highest + ); + let num_known = self.rpc_name_to_block.len(); if num_known < web3_rpcs.min_head_rpcs { @@ -338,7 +360,7 @@ impl ConnectionsGroup { } // success! this block has enough soft limit and nodes on it (or on later blocks) - let conns: Vec> = primary_consensus_rpcs + let rpcs: Vec> = primary_consensus_rpcs .into_iter() .filter_map(|conn_name| web3_rpcs.by_name.get(conn_name).cloned()) .collect(); @@ -349,8 +371,9 @@ impl ConnectionsGroup { let _ = maybe_head_block.number(); Ok(ConsensusWeb3Rpcs { + tier: *tier, head_block: Some(maybe_head_block), - rpcs: conns, + rpcs, backups_voted: backup_rpcs_voted, backups_needed: primary_rpcs_voted.is_none(), }) @@ -377,10 +400,15 @@ impl ConsensusFinder { max_block_age: Option, max_block_lag: Option, ) -> Self { + // TODO: what's a good capacity for this? 
+ let first_seen = Cache::builder() + .max_capacity(16) + .build_with_hasher(hashbrown::hash_map::DefaultHashBuilder::default()); + // TODO: this will need some thought when config reloading is written let tiers = configured_tiers .iter() - .map(|x| (*x, Default::default())) + .map(|x| (*x, ConnectionsGroup::new(first_seen.clone()))) .collect(); Self { @@ -389,9 +417,11 @@ impl ConsensusFinder { max_block_lag, } } -} -impl ConsensusFinder { + pub fn len(&self) -> usize { + self.tiers.len() + } + /// get the ConnectionsGroup that contains all rpcs /// panics if there are no tiers pub fn all_rpcs_group(&self) -> Option<&ConnectionsGroup> { @@ -421,7 +451,11 @@ impl ConsensusFinder { } /// returns the block that the rpc was on before updating to the new_block - pub fn insert(&mut self, rpc: &Web3Rpc, new_block: Web3ProxyBlock) -> Option { + pub async fn insert( + &mut self, + rpc: &Web3Rpc, + new_block: Web3ProxyBlock, + ) -> Option { let mut old = None; // TODO: error if rpc.tier is not in self.tiers @@ -432,7 +466,7 @@ impl ConsensusFinder { } // TODO: should new_block be a ref? - let x = tier_group.insert(rpc, new_block.clone()); + let x = tier_group.insert(rpc, new_block.clone()).await; if old.is_none() && x.is_some() { old = x; @@ -473,7 +507,7 @@ impl ConsensusFinder { } } - if let Some(prev_block) = self.insert(&rpc, rpc_head_block.clone()) { + if let Some(prev_block) = self.insert(&rpc, rpc_head_block.clone()).await { if prev_block.hash() == rpc_head_block.hash() { // this block was already sent by this rpc. return early false @@ -527,13 +561,13 @@ impl ConsensusFinder { // TODO: how should errors be handled? // TODO: find the best tier with a connectionsgroup. best case, this only queries the first tier // TODO: do we need to calculate all of them? I think having highest_known_block included as part of min_block_num should make that unnecessary - for (i, x) in self.tiers.iter() { - trace!("checking tier {}: {:#?}", i, x.rpc_name_to_block); + for (tier, x) in self.tiers.iter() { + trace!("checking tier {}: {:#?}", tier, x.rpc_name_to_block); if let Ok(consensus_head_connections) = x - .consensus_head_connections(authorization, web3_connections, min_block_num) + .consensus_head_connections(authorization, web3_connections, min_block_num, tier) .await { - trace!("success on tier {}", i); + trace!("success on tier {}", tier); // we got one! hopefully it didn't need to use any backups. 
// but even if it did need backup servers, that is better than going to a worse tier return Ok(consensus_head_connections); @@ -546,8 +580,8 @@ impl ConsensusFinder { #[cfg(test)] mod test { - #[test] - fn test_simplest_case_consensus_head_connections() { - todo!(); - } + // #[test] + // fn test_simplest_case_consensus_head_connections() { + // todo!(); + // } } diff --git a/web3_proxy/src/rpcs/many.rs b/web3_proxy/src/rpcs/many.rs index 19958016..4a4d1995 100644 --- a/web3_proxy/src/rpcs/many.rs +++ b/web3_proxy/src/rpcs/many.rs @@ -458,10 +458,7 @@ impl Web3Rpcs { max_block_needed )) } - cmp::Ordering::Less => { - // hmmmm - todo!("now what do we do?"); - } + cmp::Ordering::Less => min_block_needed.cmp(head_block_num), } } }; diff --git a/web3_proxy/src/rpcs/one.rs b/web3_proxy/src/rpcs/one.rs index 5b030bad..8bc94243 100644 --- a/web3_proxy/src/rpcs/one.rs +++ b/web3_proxy/src/rpcs/one.rs @@ -21,33 +21,74 @@ use serde_json::json; use std::cmp::min; use std::fmt; use std::hash::{Hash, Hasher}; -use std::sync::atomic::{self, AtomicU64}; +use std::sync::atomic::{self, AtomicU64, AtomicUsize}; use std::{cmp::Ordering, sync::Arc}; use thread_fast_rng::rand::Rng; use thread_fast_rng::thread_fast_rng; use tokio::sync::{broadcast, oneshot, watch, RwLock as AsyncRwLock}; use tokio::time::{sleep, sleep_until, timeout, Duration, Instant}; -pub struct Web3RpcLatencies { - /// Traack how far behind the fastest node we are - pub new_head: Histogram, - /// exponentially weighted moving average of how far behind the fastest node we are - pub new_head_ewma: u32, - /// Track how long an rpc call takes on average - pub request: Histogram, - /// exponentially weighted moving average of how far behind the fastest node we are - pub request_ewma: u32, +pub struct Latency { + /// Track how many milliseconds slower we are than the fastest node + pub histogram: Histogram, + /// exponentially weighted moving average of how many milliseconds behind the fastest node we are + pub ewma: ewma::EWMA, } -impl Default for Web3RpcLatencies { +impl Serialize for Latency { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + let mut state = serializer.serialize_struct("latency", 6)?; + + state.serialize_field("ewma_ms", &self.ewma.value())?; + + state.serialize_field("histogram_len", &self.histogram.len())?; + state.serialize_field("mean_ms", &self.histogram.mean())?; + state.serialize_field("p50_ms", &self.histogram.value_at_quantile(0.50))?; + state.serialize_field("p75_ms", &self.histogram.value_at_quantile(0.75))?; + state.serialize_field("p99_ms", &self.histogram.value_at_quantile(0.99))?; + + state.end() + } +} + +impl Latency { + pub fn record(&mut self, milliseconds: f64) { + self.ewma.add(milliseconds); + + // histogram needs ints and not floats + self.histogram.record(milliseconds as u64).unwrap(); + } +} + +impl Default for Latency { fn default() -> Self { - todo!("use ewma crate, not u32"); - Self { - new_head: Histogram::new(3).unwrap(), - new_head_ewma: 0, - request: Histogram::new(3).unwrap(), - request_ewma: 0, - } + // TODO: what should the default sigfig be? + let sigfig = 0; + + // TODO: what should the default span be? 25 requests? 
have a "new" + let span = 25.0; + + Self::new(sigfig, span).expect("default histogram sigfigs should always work") + } +} + +impl Latency { + pub fn new(sigfig: u8, span: f64) -> Result { + let alpha = Self::span_to_alpha(span); + + let histogram = Histogram::new(sigfig)?; + + Ok(Self { + histogram, + ewma: ewma::EWMA::new(alpha), + }) + } + + fn span_to_alpha(span: f64) -> f64 { + 2.0 / (span + 1.0) } } @@ -83,8 +124,13 @@ pub struct Web3Rpc { pub(super) tier: u64, /// TODO: change this to a watch channel so that http providers can subscribe and take action on change. pub(super) head_block: RwLock>, - /// Track how fast this RPC is - pub(super) latency: Web3RpcLatencies, + /// Track head block latency + pub(super) head_latency: RwLock, + /// Track request latency + pub(super) request_latency: RwLock, + /// Track total requests served + /// TODO: maybe move this to graphana + pub(super) total_requests: AtomicUsize, } impl Web3Rpc { @@ -1081,7 +1127,7 @@ impl Serialize for Web3Rpc { S: Serializer, { // 3 is the number of fields in the struct. - let mut state = serializer.serialize_struct("Web3Rpc", 9)?; + let mut state = serializer.serialize_struct("Web3Rpc", 10)?; // the url is excluded because it likely includes private information. just show the name that we use in keys state.serialize_field("name", &self.name)?; @@ -1103,17 +1149,17 @@ impl Serialize for Web3Rpc { state.serialize_field("soft_limit", &self.soft_limit)?; - // TODO: keep this for the "popularity_contest" command? or maybe better to just use graphana? - // state.serialize_field( - // "frontend_requests", - // &self.frontend_requests.load(atomic::Ordering::Relaxed), - // )?; + // TODO: maybe this is too much data. serialize less? + state.serialize_field("head_block", &*self.head_block.read())?; - { - // TODO: maybe this is too much data. serialize less? - let head_block = &*self.head_block.read(); - state.serialize_field("head_block", head_block)?; - } + state.serialize_field("head_latency", &*self.head_latency.read())?; + + state.serialize_field("request_latency", &*self.request_latency.read())?; + + state.serialize_field( + "total_requests", + &self.total_requests.load(atomic::Ordering::Relaxed), + )?; state.end() } @@ -1207,7 +1253,6 @@ mod tests { let block_data_limit = 64; - // TODO: this is getting long. have a `impl Default` let x = Web3Rpc { name: "name".to_string(), soft_limit: 1_000, diff --git a/web3_proxy/src/rpcs/request.rs b/web3_proxy/src/rpcs/request.rs index b3f4864a..7a2d735d 100644 --- a/web3_proxy/src/rpcs/request.rs +++ b/web3_proxy/src/rpcs/request.rs @@ -183,6 +183,12 @@ impl OpenRequestHandle { let provider = provider.expect("provider was checked already"); + self.rpc + .total_requests + .fetch_add(1, std::sync::atomic::Ordering::Relaxed); + + let start = Instant::now(); + // TODO: replace ethers-rs providers with our own that supports streaming the responses let response = match provider.as_ref() { #[cfg(test)] @@ -367,6 +373,12 @@ impl OpenRequestHandle { tokio::spawn(f); } } + } else { + // TODO: locking now will slow us down. send latency into a channel instead + self.rpc + .request_latency + .write() + .record(start.elapsed().as_secs_f64() * 1000.0); } response
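
The new Latency type above pairs an HdrHistogram with an EWMA and derives its smoothing factor from a sample "span" via alpha = 2 / (span + 1), so a span of 25 means each new sample carries roughly 1/13 of the weight. A minimal, self-contained sketch of that math, using a hand-rolled EWMA and illustrative names rather than the ewma crate the patch depends on:

// Standalone sketch of the smoothing behind `Latency`: a span of N samples
// maps to alpha = 2 / (N + 1), the weight given to each new sample.
fn span_to_alpha(span: f64) -> f64 {
    2.0 / (span + 1.0)
}

struct SimpleEwma {
    alpha: f64,
    value: Option<f64>,
}

impl SimpleEwma {
    fn new(span: f64) -> Self {
        Self {
            alpha: span_to_alpha(span),
            value: None,
        }
    }

    fn add(&mut self, sample_ms: f64) {
        self.value = Some(match self.value {
            None => sample_ms,
            Some(prev) => self.alpha * sample_ms + (1.0 - self.alpha) * prev,
        });
    }
}

fn main() {
    // a span of 25 matches the default chosen in `Latency::default`
    let mut ewma = SimpleEwma::new(25.0);
    for ms in [12.0, 15.0, 90.0, 14.0] {
        ewma.add(ms);
    }
    // a single slow sample nudges the average up without dominating it
    println!("ewma = {:.2} ms", ewma.value.unwrap_or(0.0));
}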
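
Head latency is measured against the shared first_seen cache: whichever rpc reports a block hash first stores Instant::now() under that hash, and every reporter (including the first, whose reading is near zero) records how far behind that instant it was. A rough sketch of the pattern; the u64 key and the head_latency_ms helper are illustrative only, the proxy keys on the block's H256 hash:

// Sketch of deriving head latency from a shared moka cache of first-seen times.
use moka::future::Cache;
use tokio::time::Instant;

async fn head_latency_ms(first_seen: &Cache<u64, Instant>, block_hash: u64) -> f64 {
    // the first caller for this hash inserts Instant::now(); later callers get
    // that same instant back
    let first = first_seen
        .get_with(block_hash, async { Instant::now() })
        .await;

    // ~0 ms for the first reporter, positive for everyone lagging behind it
    first.elapsed().as_secs_f64() * 1000.0
}

#[tokio::main]
async fn main() {
    let first_seen: Cache<u64, Instant> = Cache::builder().max_capacity(16).build();

    let fast = head_latency_ms(&first_seen, 0xabc).await;
    tokio::time::sleep(std::time::Duration::from_millis(50)).await;
    let slow = head_latency_ms(&first_seen, 0xabc).await;

    println!("first reporter: {fast:.1} ms, late reporter: {slow:.1} ms");
}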
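
The request.rs change applies the same idea to individual calls: count the request, time it, and record the elapsed milliseconds when the response is not treated as an error. A simplified sketch of that bookkeeping, with an invented fake_rpc_call standing in for the provider call:

// Sketch of the per-request counters added to OpenRequestHandle.
use std::sync::atomic::{AtomicUsize, Ordering};
use std::time::Instant;

static TOTAL_REQUESTS: AtomicUsize = AtomicUsize::new(0);

fn fake_rpc_call() -> Result<&'static str, &'static str> {
    Ok("0x1")
}

fn main() {
    // bump the counter before dispatching, as the patch does
    TOTAL_REQUESTS.fetch_add(1, Ordering::Relaxed);

    let start = Instant::now();
    let response = fake_rpc_call();

    if response.is_ok() {
        // the patch records latency only on the non-error path, and notes that
        // the write lock should eventually give way to a channel to keep the
        // hot path cheap
        let latency_ms = start.elapsed().as_secs_f64() * 1000.0;
        println!("request latency: {latency_ms:.3} ms");
    }

    println!("total requests: {}", TOTAL_REQUESTS.load(Ordering::Relaxed));
}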