wait_for_sync was waiting too long

This commit is contained in:
Bryan Stitt 2023-10-10 17:15:03 -07:00
parent 0e2ed425c1
commit 6906eee0dd

View File

@ -21,7 +21,7 @@ use std::sync::{atomic, Arc};
use std::time::Duration; use std::time::Duration;
use tokio::select; use tokio::select;
use tokio::task::yield_now; use tokio::task::yield_now;
use tokio::time::{sleep_until, Instant}; use tokio::time::{sleep, sleep_until, Instant};
use tracing::{debug, enabled, info, trace, warn, Level}; use tracing::{debug, enabled, info, trace, warn, Level};
#[derive(Clone, Debug, Serialize)] #[derive(Clone, Debug, Serialize)]
@ -92,7 +92,8 @@ impl PartialOrd for RpcRanking {
pub enum ShouldWaitForBlock { pub enum ShouldWaitForBlock {
Ready, Ready,
// BackupReady, // BackupReady,
Wait { current: Option<U64> }, /// how many blocks you will have to wait
Wait(U64),
// WaitForBackup { current: Option<U64> }, // WaitForBackup { current: Option<U64> },
NeverReady, NeverReady,
} }
@ -337,6 +338,10 @@ impl RankedRpcs {
min_block_num: Option<U64>, min_block_num: Option<U64>,
max_block_num: Option<U64>, max_block_num: Option<U64>,
) -> ShouldWaitForBlock { ) -> ShouldWaitForBlock {
if !rpc.healthy.load(atomic::Ordering::Relaxed) {
return ShouldWaitForBlock::NeverReady;
}
if let Some(min_block_num) = min_block_num { if let Some(min_block_num) = min_block_num {
if !self.has_block_data(rpc, min_block_num) { if !self.has_block_data(rpc, min_block_num) {
trace!( trace!(
@ -354,9 +359,9 @@ impl RankedRpcs {
Ordering::Less => { Ordering::Less => {
trace!("{} is behind. let it catch up", rpc); trace!("{} is behind. let it catch up", rpc);
// TODO: what if this is a pruned rpc that is behind by a lot, and the block is old, too? // TODO: what if this is a pruned rpc that is behind by a lot, and the block is old, too?
return ShouldWaitForBlock::Wait { return ShouldWaitForBlock::Wait(
current: Some(rpc_data.head_block_num), needed_block_num - rpc_data.head_block_num,
}; );
} }
Ordering::Greater | Ordering::Equal => { Ordering::Greater | Ordering::Equal => {
// rpc is synced past the needed block. make sure the block isn't too old for it // rpc is synced past the needed block. make sure the block isn't too old for it
@ -391,6 +396,10 @@ impl RankedRpcs {
return false; return false;
} }
if !rpc.healthy.load(atomic::Ordering::Relaxed) {
return false;
}
if let Some(min_block_needed) = min_block_needed { if let Some(min_block_needed) = min_block_needed {
if !self.has_block_data(rpc, min_block_needed) { if !self.has_block_data(rpc, min_block_needed) {
trace!( trace!(
@ -1015,13 +1024,15 @@ impl RpcsForRequest {
// TODO: do this without having 3 Vecs // TODO: do this without having 3 Vecs
let mut filtered = Vec::with_capacity(max_len); let mut filtered = Vec::with_capacity(max_len);
let mut attempted = Vec::with_capacity(max_len); let mut attempted = HashSet::with_capacity(max_len);
let mut failed = Vec::with_capacity(max_len); let mut completed = HashSet::with_capacity(max_len);
// todo!("be sure to set server_error if we exit without any rpcs!"); // todo!("be sure to set server_error if we exit without any rpcs!");
loop { loop {
if self.request.connect_timeout() { if self.request.connect_timeout() {
break; break;
} else {
yield_now().await;
} }
let mut earliest_retry_at = None; let mut earliest_retry_at = None;
@ -1032,11 +1043,11 @@ impl RpcsForRequest {
attempted.clear(); attempted.clear();
while attempted.len() + failed.len() < rpcs.len() { while attempted.len() + completed.len() < rpcs.len() {
filtered.clear(); filtered.clear();
// TODO: i'd like to do this without the collect, but since we push into `attempted`, having `attempted.contains` causes issues // TODO: i'd like to do this without the collect, but since we push into `attempted`, having `attempted.contains` causes issues
filtered.extend(rpcs.iter().filter(|x| !(attempted.contains(x) || failed.contains(x)))); filtered.extend(rpcs.iter().filter(|x| !(attempted.contains(x) || completed.contains(x))));
// tuple_windows doesn't do anything for single item iters. make the code DRY by just having it compare itself // tuple_windows doesn't do anything for single item iters. make the code DRY by just having it compare itself
if filtered.len() == 1 { if filtered.len() == 1 {
@ -1052,14 +1063,13 @@ impl RpcsForRequest {
// TODO: should x.next_available should be Reverse<_>? // TODO: should x.next_available should be Reverse<_>?
let best_rpc = best_rpc(rpc_a, rpc_b); let best_rpc = best_rpc(rpc_a, rpc_b);
attempted.push(best_rpc);
match best_rpc match best_rpc
.try_request_handle(&self.request, error_handler, false) .try_request_handle(&self.request, error_handler, false)
.await .await
{ {
Ok(OpenRequestResult::Handle(handle)) => { Ok(OpenRequestResult::Handle(handle)) => {
trace!("opened handle: {}", best_rpc); trace!("opened handle: {}", best_rpc);
completed.insert(best_rpc);
yield handle; yield handle;
} }
Ok(OpenRequestResult::RetryAt(retry_at)) => { Ok(OpenRequestResult::RetryAt(retry_at)) => {
@ -1068,13 +1078,14 @@ impl RpcsForRequest {
best_rpc, best_rpc,
retry_at.duration_since(Instant::now()).as_secs_f32() retry_at.duration_since(Instant::now()).as_secs_f32()
); );
attempted.insert(best_rpc);
earliest_retry_at = earliest_retry_at.min(Some(retry_at)); earliest_retry_at = earliest_retry_at.min(Some(retry_at, ));
continue; continue;
} }
Ok(OpenRequestResult::Lagged(x)) => { Ok(OpenRequestResult::Lagged(x)) => {
trace!("{} is lagged. will not work now", best_rpc);
// this will probably always be the same block, right? // this will probably always be the same block, right?
trace!("{} is lagged. will not work now", best_rpc);
attempted.insert(best_rpc);
if wait_for_sync.is_none() { if wait_for_sync.is_none() {
wait_for_sync = Some(x); wait_for_sync = Some(x);
} }
@ -1083,34 +1094,40 @@ impl RpcsForRequest {
Ok(OpenRequestResult::Failed) => { Ok(OpenRequestResult::Failed) => {
// TODO: log a warning? emit a stat? // TODO: log a warning? emit a stat?
trace!("best_rpc not ready: {}", best_rpc); trace!("best_rpc not ready: {}", best_rpc);
attempted.pop(); completed.insert(best_rpc);
failed.push(best_rpc);
continue; continue;
} }
Err(err) => { Err(err) => {
trace!("No request handle for {}. err={:?}", best_rpc, err); trace!("No request handle for {}. err={:?}", best_rpc, err);
attempted.pop(); completed.insert(best_rpc);
failed.push(best_rpc);
continue; continue;
} }
} }
} }
debug_assert!(!attempted.is_empty()); debug_assert!(!(attempted.is_empty() && completed.is_empty()));
} }
yield_now().await;
} }
// if we got this far, no inner or outer rpcs are ready. thats suprising since an inner should have been // if we got this far, no inner or outer rpcs are ready. thats suprising since an inner should have been
// clear earliest_retry_at if it is too far in the future to help us // clear earliest_retry_at if it is too far in the future to help us
if let Some(retry_at) = earliest_retry_at { if let Some(retry_at) = earliest_retry_at {
if self.request.connect_timeout_at() <= retry_at { if retry_at > self.request.connect_timeout_at() {
// no point in waiting. it wants us to wait too long // no point in waiting. it wants us to wait too long
earliest_retry_at = None; earliest_retry_at = None;
} }
let now = Instant::now();
if retry_at < now {
warn!("this seems like a problem and not something we should just sleep on");
earliest_retry_at = Some(now + Duration::from_millis(100))
}
} }
// TODO: i think theres a bug here so i // TODO: i think theres a bug here
match (wait_for_sync, earliest_retry_at) { match (wait_for_sync, earliest_retry_at) {
(None, None) => { (None, None) => {
// we have nothing to wait for. uh oh! // we have nothing to wait for. uh oh!
@ -1118,12 +1135,7 @@ impl RpcsForRequest {
} }
(None, Some(retry_at)) => { (None, Some(retry_at)) => {
// try again after rate limits are done // try again after rate limits are done
if retry_at > Instant::now() { sleep_until(retry_at).await;
sleep_until(retry_at).await;
} else {
// TODO: why is this happening? why would we get rate limited to now? it should be like a second at minimum
yield_now().await;
}
} }
(Some(wait_for_sync), None) => { (Some(wait_for_sync), None) => {
select! { select! {
@ -1140,7 +1152,7 @@ impl RpcsForRequest {
}, },
} }
} }
_ = sleep_until(self.request.expire_at()) => { _ = sleep_until(self.request.connect_timeout_at()) => {
break; break;
} }
} }
@ -1161,8 +1173,6 @@ impl RpcsForRequest {
} }
} }
_ = sleep_until(retry_at) => { _ = sleep_until(retry_at) => {
// if sleep didn't have to wait at all, something seems wrong. have a minimum wait?
yield_now().await;
continue; continue;
} }
} }