Skip to content

Commit

Permalink
Added knobs so that the client failure monitoring update rate and the server failure monitoring update rate are separate knobs
Browse files Browse the repository at this point in the history
  • Loading branch information
etschannen committed Dec 1, 2017
1 parent c3918d8 commit 482ac38
Show file tree
Hide file tree
Showing 6 changed files with 17 additions and 7 deletions.
1 change: 1 addition & 0 deletions fdbclient/Knobs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ ClientKnobs::ClientKnobs(bool randomize) {
init( FAILURE_MAX_DELAY, 10.0 ); if( randomize && BUGGIFY ) FAILURE_MAX_DELAY = 5.0;
init( FAILURE_MIN_DELAY, 5.0 ); if( randomize && BUGGIFY ) FAILURE_MIN_DELAY = 2.0;
init( FAILURE_TIMEOUT_DELAY, FAILURE_MIN_DELAY );
init( CLIENT_FAILURE_TIMEOUT_DELAY, FAILURE_MIN_DELAY );

// wrong_shard_server sometimes comes from the only nonfailed server, so we need to avoid a fast spin

Expand Down
1 change: 1 addition & 0 deletions fdbclient/Knobs.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ class ClientKnobs : public Knobs {
double FAILURE_MAX_DELAY;
double FAILURE_MIN_DELAY;
double FAILURE_TIMEOUT_DELAY;
double CLIENT_FAILURE_TIMEOUT_DELAY;

// wrong_shard_server sometimes comes from the only nonfailed server, so we need to avoid a fast spin
double WRONG_SHARD_SERVER_DELAY; // SOMEDAY: This delay can limit performance of retrieving data when the cache is mostly wrong (e.g. dumping the database after a test)
Expand Down
4 changes: 3 additions & 1 deletion fdbrpc/FailureMonitor.actor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,9 @@ ACTOR Future<Void> waitForContinuousFailure( IFailureMonitor* monitor, Endpoint

// X == sustainedFailureDuration + slope * (now()-startT+X)
double waitDelay = (sustainedFailureDuration + slope * (now()-startT)) / (1-slope);
if(waitDelay < FLOW_KNOBS->CLIENT_REQUEST_INTERVAL) //We will not get a failure monitoring update in this amount of time, so there is no point in waiting for changes

//SOMEDAY: if we know that this process is a server or client we can tune this optimization better
if(waitDelay < std::min(FLOW_KNOBS->CLIENT_REQUEST_INTERVAL, FLOW_KNOBS->SERVER_REQUEST_INTERVAL)) //We will not get a failure monitoring update in this amount of time, so there is no point in waiting for changes
waitDelay = 0;
choose {
when (Void _ = wait( monitor->onStateEqual( endpoint, FailureStatus(false) ) )) {} // SOMEDAY: Use onStateChanged() for efficiency
Expand Down
16 changes: 10 additions & 6 deletions fdbserver/ClusterController.actor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1109,7 +1109,6 @@ ACTOR Future<Void> failureDetectionServer( UID uniqueID, FutureStream< FailureMo
state std::deque<SystemFailureStatus> statusHistory; // The last change in statusHistory is from currentVersion-1 to currentVersion
state Future<Void> periodically = Void();
state double lastT = 0;
state double clientRequestInterval = FLOW_KNOBS->CLIENT_REQUEST_INTERVAL;

loop choose {
when ( FailureMonitoringRequest req = waitNext( requests ) ) {
Expand Down Expand Up @@ -1150,8 +1149,13 @@ ACTOR Future<Void> failureDetectionServer( UID uniqueID, FutureStream< FailureMo
TEST(true); // failureDetectionServer sending failure data to requester
FailureMonitoringReply reply;
reply.failureInformationVersion = currentVersion;
reply.clientRequestIntervalMS = clientRequestInterval * 1000;
reply.considerServerFailedTimeoutMS = CLIENT_KNOBS->FAILURE_TIMEOUT_DELAY * 1000;
if( req.senderStatus.present() ) {
reply.clientRequestIntervalMS = FLOW_KNOBS->SERVER_REQUEST_INTERVAL * 1000;
reply.considerServerFailedTimeoutMS = CLIENT_KNOBS->FAILURE_TIMEOUT_DELAY * 1000;
} else {
reply.clientRequestIntervalMS = FLOW_KNOBS->CLIENT_REQUEST_INTERVAL * 1000;
reply.considerServerFailedTimeoutMS = CLIENT_KNOBS->CLIENT_FAILURE_TIMEOUT_DELAY * 1000;
}

ASSERT( currentVersion >= (int64_t)statusHistory.size());

Expand All @@ -1173,7 +1177,7 @@ ACTOR Future<Void> failureDetectionServer( UID uniqueID, FutureStream< FailureMo
}
}
when ( Void _ = wait( periodically ) ) {
periodically = delay( FLOW_KNOBS->CLIENT_REQUEST_INTERVAL );
periodically = delay( FLOW_KNOBS->SERVER_REQUEST_INTERVAL );
double t = now();
if (lastT != 0 && t - lastT > 1)
TraceEvent("LongDelayOnClusterController").detail("Duration", t - lastT);
Expand All @@ -1192,15 +1196,15 @@ ACTOR Future<Void> failureDetectionServer( UID uniqueID, FutureStream< FailureMo
std::nth_element(delays.begin(), delays.begin()+pivot, delays.end());
pivotDelay = *(delays.begin()+pivot);
}
pivotDelay = std::max(0.0, pivotDelay - clientRequestInterval);
pivotDelay = std::max(0.0, pivotDelay - FLOW_KNOBS->SERVER_REQUEST_INTERVAL);

TraceEvent("FailureDetectionPoll", uniqueID).detail("PivotDelay", pivotDelay).detail("Clients", currentStatus.size());
//TraceEvent("FailureDetectionAcceptableDelay").detail("ms", acceptableDelay*1000);

for(auto it = currentStatus.begin(); it != currentStatus.end(); ) {
double delay = t - it->second.lastRequestTime;

if ( it->first != g_network->getLocalAddress() && ( delay > pivotDelay * 2 + clientRequestInterval + CLIENT_KNOBS->FAILURE_MIN_DELAY || delay > CLIENT_KNOBS->FAILURE_MAX_DELAY ) ) {
if ( it->first != g_network->getLocalAddress() && ( delay > pivotDelay * 2 + FLOW_KNOBS->SERVER_REQUEST_INTERVAL + CLIENT_KNOBS->FAILURE_MIN_DELAY || delay > CLIENT_KNOBS->FAILURE_MAX_DELAY ) ) {
//printf("Failure Detection Server: Status of '%s' is now '%s' after %f sec\n", it->first.toString().c_str(), "Failed", now() - it->second.lastRequestTime);
TraceEvent("FailureDetectionStatus", uniqueID).detail("System", it->first).detail("Status","Failed").detail("Why", "Timeout").detail("LastRequestAge", delay)
.detail("PivotDelay", pivotDelay);
Expand Down
1 change: 1 addition & 0 deletions flow/Knobs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ FlowKnobs::FlowKnobs(bool randomize, bool isSimulated) {
init( DELAY_JITTER_RANGE, 0.2 );
init( BUSY_WAIT_THRESHOLD, 0 ); // 1e100 == never sleep
init( CLIENT_REQUEST_INTERVAL, 0.1 ); if( randomize && BUGGIFY ) CLIENT_REQUEST_INTERVAL = 1.0;
init( SERVER_REQUEST_INTERVAL, 0.1 ); if( randomize && BUGGIFY ) SERVER_REQUEST_INTERVAL = 1.0;

init( REACTOR_FLAGS, 0 );

Expand Down
1 change: 1 addition & 0 deletions flow/Knobs.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ class FlowKnobs : public Knobs {
double DELAY_JITTER_RANGE;
double BUSY_WAIT_THRESHOLD;
double CLIENT_REQUEST_INTERVAL;
double SERVER_REQUEST_INTERVAL;

int DISABLE_ASSERTS;
double QUEUE_MODEL_SMOOTHING_AMOUNT;
Expand Down

0 comments on commit 482ac38

Please sign in to comment.