Bryan Stitt 3bbbdd5596
Flush stats (#161)
* create buffer if mysql OR influx is set up

* this one flush should error

* it helps to set up the db in the db migration test

* comment
2023-07-05 19:24:21 -07:00

331 lines
12 KiB

use super::{AppStat, RpcQueryKey};
use crate::app::{RpcSecretKeyCache, UserBalanceCache, Web3ProxyJoinHandle};
use crate::errors::Web3ProxyResult;
use crate::frontend::authorization::Balance;
use derive_more::From;
use futures::stream;
use hashbrown::HashMap;
use influxdb2::api::write::TimestampPrecision;
use migration::sea_orm::prelude::Decimal;
use migration::sea_orm::DatabaseConnection;
use std::time::Duration;
use tokio::sync::{broadcast, oneshot};
use tokio::time::{interval, sleep};
use tracing::{error, info, trace, warn};
/// Aggregated values for one [`RpcQueryKey`], held in memory until the next save.
///
/// `StatBuffer` stores one of these per key in each of its buffers; the save methods
/// drain those buffers and consume the entries.
#[derive(Debug, Default)]
pub struct BufferedRpcQueryStats {
// request counters
pub frontend_requests: u64,
pub backend_requests: u64,
pub backend_retries: u64,
pub no_servers: u64,
// cache counters
pub cache_misses: u64,
pub cache_hits: u64,
// running totals (presumably accumulated across every stat merged into this entry)
pub sum_request_bytes: u64,
pub sum_response_bytes: u64,
pub sum_response_millis: u64,
pub sum_credits_used: Decimal,
pub sum_cu_used: Decimal,
/// The user's balance at this point in time. Multiple queries might be modifying it at once.
pub latest_balance: Balance,
/// What `StatBuffer::try_spawn` hands back once the background task has been spawned.
pub struct SpawnedStatBuffer {
/// Sending half of the channel whose receiving half is read by the background loop.
pub stat_sender: flume::Sender<AppStat>,
/// Join handle of the spawned background task.
/// These handles are important and must be allowed to finish.
pub background_handle: Web3ProxyJoinHandle<()>,
/// In-memory buffers for stats plus the connections they are periodically saved to.
pub struct StatBuffer {
/// Entries waiting for the relational database; drained by `save_relational_stats`.
accounting_db_buffer: HashMap<RpcQueryKey, BufferedRpcQueryStats>,
billing_period_seconds: i64,
/// Passed along when building time-series points.
chain_id: u64,
/// Relational database connection. `save_relational_stats` saves nothing when this is `None`.
db_conn: Option<DatabaseConnection>,
/// Period, in seconds, of the relational save timer.
db_save_interval_seconds: u32,
/// Entries drained by `save_tsdb_stats` and built into points with the name "global_proxy".
global_timeseries_buffer: HashMap<RpcQueryKey, BufferedRpcQueryStats>,
/// Must be set whenever `influxdb_client` is set; `save_tsdb_stats` panics otherwise.
influxdb_bucket: Option<String>,
/// InfluxDB client. `save_tsdb_stats` saves nothing when this is `None`.
influxdb_client: Option<influxdb2::Client>,
/// Entries drained by `save_tsdb_stats` and built into points with the name "opt_in_proxy".
opt_in_timeseries_buffer: HashMap<RpcQueryKey, BufferedRpcQueryStats>,
rpc_secret_key_cache: RpcSecretKeyCache,
user_balance_cache: UserBalanceCache,
timestamp_precision: TimestampPrecision,
/// Period, in seconds, of the time-series save timer.
tsdb_save_interval_seconds: u32,
impl StatBuffer {
/// Build a `StatBuffer` and spawn its aggregate-and-save loop with `tokio::spawn`.
///
/// An InfluxDB client given without a bucket is discarded. If that leaves neither a
/// database connection nor an InfluxDB client, nothing is spawned and `Ok(None)` is
/// returned. Otherwise the result holds the stat sender and the task's join handle.
///
/// # Panics
///
/// Once at least one backend is configured, `rpc_secret_key_cache` and
/// `user_balance_cache` are unwrapped, so both must be `Some`.
pub fn try_spawn(
billing_period_seconds: i64,
chain_id: u64,
db_conn: Option<DatabaseConnection>,
db_save_interval_seconds: u32,
influxdb_bucket: Option<String>,
mut influxdb_client: Option<influxdb2::Client>,
rpc_secret_key_cache: Option<RpcSecretKeyCache>,
user_balance_cache: Option<UserBalanceCache>,
shutdown_receiver: broadcast::Receiver<()>,
tsdb_save_interval_seconds: u32,
flush_receiver: flume::Receiver<oneshot::Sender<(usize, usize)>>,
) -> anyhow::Result<Option<SpawnedStatBuffer>> {
// save_tsdb_stats expects a bucket whenever a client is set, so a client without a
// bucket is treated as if no client was given
if influxdb_bucket.is_none() {
influxdb_client = None;
// with nowhere to save stats there is no reason to run the buffer at all
if db_conn.is_none() && influxdb_client.is_none() {
return Ok(None);
// stats reach the background task over an unbounded channel
let (stat_sender, stat_receiver) = flume::unbounded();
let timestamp_precision = TimestampPrecision::Seconds;
let mut new = Self {
accounting_db_buffer: Default::default(),
global_timeseries_buffer: Default::default(),
opt_in_timeseries_buffer: Default::default(),
// both caches are required from here on; `None` panics
rpc_secret_key_cache: rpc_secret_key_cache.unwrap(),
user_balance_cache: user_balance_cache.unwrap(),
// any errors inside this task will cause the application to exit
// TODO? change this to the X and XTask pattern like the latency crate uses
let handle = tokio::spawn(async move {
new.aggregate_and_save_loop(stat_receiver, shutdown_receiver, flush_receiver)
// the (sender, handle) pair is converted into the return type with `.into()`
Ok(Some((stat_sender, handle).into()))
/// Main loop of the background task.
///
/// Each pass through the loop handles one of:
/// - a stat (or a receive error) from `stat_receiver`
/// - a tick of the relational save timer
/// - a tick of the time-series save timer
/// - a flush request from `flush_receiver`, which is answered with
///   `(tsdb_count, relational_count)`
/// - a message (or error) from `shutdown_receiver`
///
/// Towards the end of the function any stats still buffered are saved once more to
/// both the relational database and the time-series database.
async fn aggregate_and_save_loop(
&mut self,
stat_receiver: flume::Receiver<AppStat>,
mut shutdown_receiver: broadcast::Receiver<()>,
flush_receiver: flume::Receiver<oneshot::Sender<(usize, usize)>>,
) -> Web3ProxyResult<()> {
// two independent timers, one per storage backend
let mut tsdb_save_interval =
interval(Duration::from_secs(self.tsdb_save_interval_seconds as u64));
let mut db_save_interval =
interval(Duration::from_secs(self.db_save_interval_seconds as u64));
loop {
tokio::select! {
stat = stat_receiver.recv_async() => {
// trace!("Received stat");
// save the stat to a buffer
match stat {
Ok(AppStat::RpcQuery(stat)) => {
// time-series keys are only computed when an InfluxDB client is configured
if self.influxdb_client.is_some() {
// TODO: round the timestamp at all?
let global_timeseries_key = stat.global_timeseries_key();
// not every stat has an opt-in key
if let Some(opt_in_timeseries_key) = stat.owned_timeseries_key() {
// only relevant when a database connection is configured
if self.db_conn.is_some() {
Err(err) => {
info!("error receiving stat: {}", err);
_ = db_save_interval.tick() => {
trace!("DB save internal tick");
let count = self.save_relational_stats().await;
if count > 0 {
trace!("Saved {} stats to the relational db", count);
_ = tsdb_save_interval.tick() => {
trace!("TSDB save internal tick");
let count = self.save_tsdb_stats().await;
if count > 0 {
trace!("Saved {} stats to the tsdb", count);
// a flush request carries a oneshot sender used to report how many stats were saved
x = flush_receiver.recv_async() => {
if let Ok(x) = x {
let tsdb_count = self.save_tsdb_stats().await;
if tsdb_count > 0 {
trace!("Flushed {} stats to the tsdb", tsdb_count);
let relational_count = self.save_relational_stats().await;
if relational_count > 0 {
trace!("Flushed {} stats to the relational db", relational_count);
// failing to deliver the counts is only logged; the flush itself already happened
if let Err(err) = x.send((tsdb_count, relational_count)) {
warn!(%tsdb_count, %relational_count, ?err, "unable to notify about flushed stats");
} else {
x = shutdown_receiver.recv() => {
match x {
Ok(_) => {
info!("stat_loop shutting down");
Err(err) => error!("stat_loop shutdown receiver err={:?}", err),
// TODO: wait on all websockets to close
// TODO: wait on all pending external requests to finish
// NOTE(review): the message below mentions a 10 second wait, but the only sleep visible
// in this section is inside the commented-out loop that follows — confirm
info!("waiting 10 seconds for remaining stats to arrive");
// loop {
// // nope. this won't ever be true because we keep making stats for internal requests
// // if stat_receiver.is_disconnected() {
// // info!("stat_receiver is disconnected");
// // break;
// // }
// // TODO: don't just sleep. watch a channel
// sleep(Duration::from_millis(10)).await;
// }
// final save of whatever is still buffered
let saved_relational = self.save_relational_stats().await;
info!("saved {} pending relational stat(s)", saved_relational);
let saved_tsdb = self.save_tsdb_stats().await;
info!("saved {} pending tsdb stat(s)", saved_tsdb);
info!("accounting and stat save loop complete");
/// Drain `accounting_db_buffer` and save every entry through `db_conn`.
///
/// `count` starts at 0 and is only changed when a database connection is configured,
/// in which case it is set to the buffer length before draining. A failed save is
/// logged; nothing in this section re-queues the failed entry.
async fn save_relational_stats(&mut self) -> usize {
let mut count = 0;
if let Some(db_conn) = self.db_conn.as_ref() {
// taken before draining, so it is the number of buffered entries, not the number of successful saves
count = self.accounting_db_buffer.len();
// drain() empties the buffer and yields owned (key, stat) pairs
for (key, stat) in self.accounting_db_buffer.drain() {
// TODO: batch saves
// TODO: i don't like passing key (which came from the stat) to the function on the stat. but it works for now
if let Err(err) = stat
error!("unable to save accounting entry! err={:?}", err);
// TODO: bucket should be an enum so that we don't risk typos
/// Drain both time-series buffers, build points from the entries, and write the points
/// with the InfluxDB client in batches.
///
/// `count` starts at 0 and is only changed when an InfluxDB client is configured, in
/// which case it is set to the number of points collected. Entries whose point fails to
/// build are logged and not retried. Points are written in batches of at most 100; a
/// failed batch is logged and its points are not retried.
///
/// # Panics
///
/// Panics if an InfluxDB client is set but `influxdb_bucket` is not.
async fn save_tsdb_stats(&mut self) -> usize {
let mut count = 0;
if let Some(influxdb_client) = self.influxdb_client.as_ref() {
let influxdb_bucket = self
.expect("if client is set, bucket must be set");
// TODO: use stream::iter properly to avoid allocating this Vec
let mut points = vec![];
// global stats first; drain() empties the buffer and yields owned (key, stat) pairs
for (key, stat) in self.global_timeseries_buffer.drain() {
// TODO: i don't like passing key (which came from the stat) to the function on the stat. but it works for now
match stat
.build_timeseries_point("global_proxy", self.chain_id, key)
Ok(point) => {
Err(err) => {
error!("unable to build global stat! err={:?}", err);
// then the opt-in stats, built the same way under a different name
for (key, stat) in self.opt_in_timeseries_buffer.drain() {
// TODO: i don't like passing key (which came from the stat) to the function on the stat. but it works for now
match stat
.build_timeseries_point("opt_in_proxy", self.chain_id, key)
Ok(point) => {
Err(err) => {
// TODO: if this errors, we throw away some of the pending stats! we should probably buffer them somewhere to be tried again
error!("unable to build opt-in stat! err={:?}", err);
count = points.len();
if count > 0 {
// TODO: put max_batch_size in config?
// TODO: i think the real limit is the byte size of the http request. so, a simple line count won't work very well
let max_batch_size = 100;
let mut num_left = count;
while num_left > 0 {
// the last batch may be smaller than max_batch_size
let batch_size = num_left.min(max_batch_size);
// TODO: there has to be a better way to chunk this up. chunk on the stream with the stream being an iter?
// split_off keeps the first `batch_size` points in `points` and returns the rest as `p`
let p = points.split_off(batch_size);
num_left -= batch_size;
if let Err(err) = influxdb_client
// TODO: if this errors, we throw away some of the pending stats! we should probably buffer them somewhere to be tried again
error!("unable to save {} tsdb stats! err={:?}", batch_size, err);
// carry on with the points that have not been written yet
points = p;