HADOOP-19096. [ABFS] [CST Optimization] Enhance Client-Side Throttling Metrics Logic (apache#6276)


ABFS has a client-side throttling mechanism which works on metrics
collected from past requests.

When requests fail due to server-side throttling, it updates its
metrics and recalculates any client-side backoff.

The choice of which requests should be used to compute the client-side
backoff interval is based on the HTTP status code (see the sketch after
this list):

- Status code in 2xx range: Successful Operations should contribute.
- Status code in 3xx range: Redirection Operations should not contribute.
- Status code in 4xx range: User Errors should not contribute.
- Status code is 503: Throttling errors should contribute only if they
  are due to a client limit breach, as follows:
  * 503, Ingress Over Account Limit: Should Contribute
  * 503, Egress Over Account Limit: Should Contribute
  * 503, TPS Over Account Limit: Should Contribute
  * 503, Other Server Throttling: Should not Contribute.
- Status code in 5xx range other than 503: Should not Contribute.
- IOException and UnknownHostException: Should not Contribute.
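As a sketch of how these rules combine, the following standalone predicate (hypothetical class and method names; the committed logic lives in AbfsRestOperation#shouldUpdateCSTMetrics, guarded by a wasKnownExceptionThrown flag) folds the status-code check, the failure-reason abbreviations from RetryReasonConstants, and the exception guard into one place:

import java.net.HttpURLConnection;

// Hypothetical demo; only the rules, abbreviations and guard come from this commit.
public final class CstContributionSketch {

  // Failure-reason abbreviations as defined in RetryReasonConstants.
  private static final String ING = "ING"; // Ingress over account limit
  private static final String EGR = "EGR"; // Egress over account limit
  private static final String OPR = "OPR"; // TPS over account limit

  static boolean shouldContribute(int statusCode, String failureReason,
      boolean wasKnownExceptionThrown) {
    if (wasKnownExceptionThrown) {
      return false; // UnknownHostException / IOException: never contribute
    }
    if (statusCode < HttpURLConnection.HTTP_MULT_CHOICE) {
      return true;  // 2xx: successful operations contribute
    }
    // 3xx, 4xx, non-503 5xx and plain 503 throttling all fall through to
    // false unless the 503 was classified as a client limit breach.
    return ING.equals(failureReason)
        || EGR.equals(failureReason)
        || OPR.equals(failureReason);
  }

  public static void main(String[] args) {
    System.out.println(shouldContribute(200, null, false));  // true  (Case 1)
    System.out.println(shouldContribute(503, "EGR", false)); // true  (Case 4.b)
    System.out.println(shouldContribute(503, "OTH", false)); // false (Case 4.d)
    System.out.println(shouldContribute(404, null, false));  // false (Case 3)
  }
}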

Contributed by Anuj Modi
anujmodi2021 authored Apr 10, 2024
1 parent 281e2d2 commit dbe2d61
Showing 8 changed files with 134 additions and 77 deletions.
@@ -42,8 +42,14 @@ public enum AzureServiceErrorCode {
INVALID_SOURCE_OR_DESTINATION_RESOURCE_TYPE("InvalidSourceOrDestinationResourceType", HttpURLConnection.HTTP_CONFLICT, null),
RENAME_DESTINATION_PARENT_PATH_NOT_FOUND("RenameDestinationParentPathNotFound", HttpURLConnection.HTTP_NOT_FOUND, null),
INVALID_RENAME_SOURCE_PATH("InvalidRenameSourcePath", HttpURLConnection.HTTP_CONFLICT, null),
INGRESS_OVER_ACCOUNT_LIMIT(null, HttpURLConnection.HTTP_UNAVAILABLE, "Ingress is over the account limit."),
EGRESS_OVER_ACCOUNT_LIMIT(null, HttpURLConnection.HTTP_UNAVAILABLE, "Egress is over the account limit."),
INGRESS_OVER_ACCOUNT_LIMIT("ServerBusy", HttpURLConnection.HTTP_UNAVAILABLE,
"Ingress is over the account limit."),
EGRESS_OVER_ACCOUNT_LIMIT("ServerBusy", HttpURLConnection.HTTP_UNAVAILABLE,
"Egress is over the account limit."),
TPS_OVER_ACCOUNT_LIMIT("ServerBusy", HttpURLConnection.HTTP_UNAVAILABLE,
"Operations per second is over the account limit."),
OTHER_SERVER_THROTTLING("ServerBusy", HttpURLConnection.HTTP_UNAVAILABLE,
"The server is currently unable to receive requests. Please retry your request."),
INVALID_QUERY_PARAMETER_VALUE("InvalidQueryParameterValue", HttpURLConnection.HTTP_BAD_REQUEST, null),
AUTHORIZATION_PERMISSION_MISS_MATCH("AuthorizationPermissionMismatch", HttpURLConnection.HTTP_FORBIDDEN, null),
ACCOUNT_REQUIRES_HTTPS("AccountRequiresHttps", HttpURLConnection.HTTP_BAD_REQUEST, null),
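All four 503 variants above share the "ServerBusy" error code (previously null for the two that existed), so they can only be told apart by their error message; this is why the retry-reason logic further down matches on getErrorMessage() rather than on the error code.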
@@ -167,7 +167,7 @@ private AbfsClient(final URL baseUrl,
DelegatingSSLSocketFactory.initializeDefaultFactory(this.abfsConfiguration.getPreferredSSLFactoryOption());
sslProviderName = DelegatingSSLSocketFactory.getDefaultFactory().getProviderName();
} catch (IOException e) {
// Suppress exception. Failure to init DelegatingSSLSocketFactory would have only performance impact.
// Suppress exception, failure to init DelegatingSSLSocketFactory would have only performance impact.
LOG.trace("NonCritFailure: DelegatingSSLSocketFactory Init failed : "
+ "{}", e.getMessage());
}
@@ -39,7 +39,9 @@
import org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding;

import static org.apache.hadoop.fs.azurebfs.constants.AbfsHttpConstants.HTTP_CONTINUE;
import static org.apache.hadoop.fs.azurebfs.services.RetryReasonConstants.CONNECTION_TIMEOUT_ABBREVIATION;
import static org.apache.hadoop.fs.azurebfs.services.RetryReasonConstants.EGRESS_LIMIT_BREACH_ABBREVIATION;
import static org.apache.hadoop.fs.azurebfs.services.RetryReasonConstants.INGRESS_LIMIT_BREACH_ABBREVIATION;
import static org.apache.hadoop.fs.azurebfs.services.RetryReasonConstants.TPS_LIMIT_BREACH_ABBREVIATION;

/**
* The AbfsRestOperation for Rest AbfsClient.
@@ -283,7 +285,8 @@ String getClientLatency() {
private boolean executeHttpOperation(final int retryCount,
TracingContext tracingContext) throws AzureBlobFileSystemException {
AbfsHttpOperation httpOperation;
boolean wasIOExceptionThrown = false;
// Used to avoid CST Metric Update in Case of UnknownHost/IO Exception.
boolean wasKnownExceptionThrown = false;

try {
// initialize the HTTP request and open the connection
@@ -321,7 +324,27 @@ private boolean executeHttpOperation(final int retryCount,
} else if (httpOperation.getStatusCode() == HttpURLConnection.HTTP_UNAVAILABLE) {
incrementCounter(AbfsStatistic.SERVER_UNAVAILABLE, 1);
}

// If no exception occurred till here, it means the http operation completed and
// a response from the server has been received, which might be a success or a failure.
// If any kind of exception has occurred, it will be caught below.
// If the request failed, determine the failure reason and retry policy here;
// else simply return with success after saving the result.
LOG.debug("HttpRequest: {}: {}", operationType, httpOperation);

int status = httpOperation.getStatusCode();
failureReason = RetryReason.getAbbreviation(null, status, httpOperation.getStorageErrorMessage());
retryPolicy = client.getRetryPolicy(failureReason);

if (retryPolicy.shouldRetry(retryCount, httpOperation.getStatusCode())) {
return false;
}

// If the request has succeeded or failed with a non-retriable error, save the operation and return.
result = httpOperation;

} catch (UnknownHostException ex) {
wasKnownExceptionThrown = true;
String hostname = null;
hostname = httpOperation.getHost();
failureReason = RetryReason.getAbbreviation(ex, null, null);
@@ -333,57 +356,27 @@ private boolean executeHttpOperation(final int retryCount,
}
return false;
} catch (IOException ex) {
wasKnownExceptionThrown = true;
if (LOG.isDebugEnabled()) {
LOG.debug("HttpRequestFailure: {}, {}", httpOperation, ex);
}

failureReason = RetryReason.getAbbreviation(ex, -1, "");
retryPolicy = client.getRetryPolicy(failureReason);
wasIOExceptionThrown = true;
if (!retryPolicy.shouldRetry(retryCount, -1)) {
throw new InvalidAbfsRestOperationException(ex, retryCount);
}

return false;
} finally {
int status = httpOperation.getStatusCode();
/*
A status less than 300 (2xx range) or greater than or equal
to 500 (5xx range) should contribute to throttling metrics being updated.
Less than 200 or greater than or equal to 500 show failed operations. 2xx
range contributes to successful operations. 3xx range is for redirects
and 4xx range is for user errors. These should not be a part of
throttling backoff computation.
*/
boolean updateMetricsResponseCode = (status < HttpURLConnection.HTTP_MULT_CHOICE
|| status >= HttpURLConnection.HTTP_INTERNAL_ERROR);

/*
Connection Timeout failures should not contribute to throttling
In case the current request fails with Connection Timeout we will have
ioExceptionThrown true and failure reason as CT
In case the current request failed with 5xx, failure reason will be
updated after finally block but wasIOExceptionThrown will be false;
*/
boolean isCTFailure = CONNECTION_TIMEOUT_ABBREVIATION.equals(failureReason) && wasIOExceptionThrown;

if (updateMetricsResponseCode && !isCTFailure) {
int statusCode = httpOperation.getStatusCode();
// Update Metrics only if Succeeded or Throttled due to account limits.
// Also update in case any unhandled exception is thrown.
if (shouldUpdateCSTMetrics(statusCode) && !wasKnownExceptionThrown) {
intercept.updateMetrics(operationType, httpOperation);
}
}

LOG.debug("HttpRequest: {}: {}", operationType, httpOperation);

int status = httpOperation.getStatusCode();
failureReason = RetryReason.getAbbreviation(null, status, httpOperation.getStorageErrorMessage());
retryPolicy = client.getRetryPolicy(failureReason);

if (retryPolicy.shouldRetry(retryCount, httpOperation.getStatusCode())) {
return false;
}

result = httpOperation;

return true;
}
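The net effect of the two hunks above: retry-policy evaluation and the assignment of result move out of the finally block into the try block, leaving the finally block with a single job, updating client-side throttling metrics, and doing so only when shouldUpdateCSTMetrics(statusCode) holds and no UnknownHostException/IOException was thrown on this attempt.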

@@ -443,6 +436,34 @@ private void incrementCounter(AbfsStatistic statistic, long value) {
}
}

/**
* Updates Client-Side Throttling Metrics for relevant response status codes.
* The following criteria are used to decide, based on status code and failure reason.
* <ol>
* <li>Case 1: Status code in 2xx range: Successful Operations should contribute</li>
* <li>Case 2: Status code in 3xx range: Redirection Operations should not contribute</li>
* <li>Case 3: Status code in 4xx range: User Errors should not contribute</li>
* <li>
* Case 4: Status code is 503: Throttling Error should contribute as follows:
* <ol>
* <li>Case 4.a: Ingress Over Account Limit: Should Contribute</li>
* <li>Case 4.b: Egress Over Account Limit: Should Contribute</li>
* <li>Case 4.c: TPS Over Account Limit: Should Contribute</li>
* <li>Case 4.d: Other Server Throttling: Should not contribute</li>
* </ol>
* </li>
* <li>Case 5: Status code in 5xx range other than 503: Should not contribute</li>
* </ol>
* @param statusCode http status code of the server response
* @return true if the operation should contribute to CST metrics, false otherwise
*/
private boolean shouldUpdateCSTMetrics(final int statusCode) {
return statusCode < HttpURLConnection.HTTP_MULT_CHOICE // Case 1
|| INGRESS_LIMIT_BREACH_ABBREVIATION.equals(failureReason) // Case 4.a
|| EGRESS_LIMIT_BREACH_ABBREVIATION.equals(failureReason) // Case 4.b
|| TPS_LIMIT_BREACH_ABBREVIATION.equals(failureReason); // Case 4.c
}
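Cases 2, 3, 4.d and 5 need no explicit branch: any status code of 300 or above whose failure reason is not one of the three limit-breach abbreviations falls through to false.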

/**
* Creates a new Tracing context before entering the retry loop of a rest operation.
* This will ensure all rest operations have unique
@@ -26,13 +26,13 @@ private RetryReasonConstants() {
public static final String CONNECTION_TIMEOUT_JDK_MESSAGE = "connect timed out";
public static final String READ_TIMEOUT_JDK_MESSAGE = "Read timed out";
public static final String CONNECTION_RESET_MESSAGE = "Connection reset";
public static final String OPERATION_BREACH_MESSAGE = "Operations per second is over the account limit.";
public static final String CONNECTION_RESET_ABBREVIATION = "CR";
public static final String CONNECTION_TIMEOUT_ABBREVIATION = "CT";
public static final String READ_TIMEOUT_ABBREVIATION = "RT";
public static final String INGRESS_LIMIT_BREACH_ABBREVIATION = "ING";
public static final String EGRESS_LIMIT_BREACH_ABBREVIATION = "EGR";
public static final String OPERATION_LIMIT_BREACH_ABBREVIATION = "OPR";
public static final String TPS_LIMIT_BREACH_ABBREVIATION = "OPR";
public static final String OTHER_SERVER_THROTTLING_ABBREVIATION = "OTH";
public static final String UNKNOWN_HOST_EXCEPTION_ABBREVIATION = "UH";
public static final String IO_EXCEPTION_ABBREVIATION = "IOE";
public static final String SOCKET_EXCEPTION_ABBREVIATION = "SE";
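Note that TPS_LIMIT_BREACH_ABBREVIATION keeps the old "OPR" wire value, so anything keyed on the abbreviation (logs, metrics) is unaffected by the rename from OPERATION_LIMIT_BREACH_ABBREVIATION, and OPERATION_BREACH_MESSAGE disappears because the message now lives on AzureServiceErrorCode.TPS_OVER_ACCOUNT_LIMIT.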
@@ -22,10 +22,12 @@
import static org.apache.hadoop.fs.azurebfs.constants.AbfsHttpConstants.HTTP_STATUS_CATEGORY_QUOTIENT;
import static org.apache.hadoop.fs.azurebfs.contracts.services.AzureServiceErrorCode.EGRESS_OVER_ACCOUNT_LIMIT;
import static org.apache.hadoop.fs.azurebfs.contracts.services.AzureServiceErrorCode.INGRESS_OVER_ACCOUNT_LIMIT;
import static org.apache.hadoop.fs.azurebfs.contracts.services.AzureServiceErrorCode.OTHER_SERVER_THROTTLING;
import static org.apache.hadoop.fs.azurebfs.contracts.services.AzureServiceErrorCode.TPS_OVER_ACCOUNT_LIMIT;
import static org.apache.hadoop.fs.azurebfs.services.RetryReasonConstants.EGRESS_LIMIT_BREACH_ABBREVIATION;
import static org.apache.hadoop.fs.azurebfs.services.RetryReasonConstants.INGRESS_LIMIT_BREACH_ABBREVIATION;
import static org.apache.hadoop.fs.azurebfs.services.RetryReasonConstants.OPERATION_BREACH_MESSAGE;
import static org.apache.hadoop.fs.azurebfs.services.RetryReasonConstants.OPERATION_LIMIT_BREACH_ABBREVIATION;
import static org.apache.hadoop.fs.azurebfs.services.RetryReasonConstants.TPS_LIMIT_BREACH_ABBREVIATION;
import static org.apache.hadoop.fs.azurebfs.services.RetryReasonConstants.OTHER_SERVER_THROTTLING_ABBREVIATION;

/**
* Category that can capture server-response errors for 5XX status-code.
@@ -56,9 +58,13 @@ String getAbbreviation(final Integer statusCode,
splitedServerErrorMessage)) {
return EGRESS_LIMIT_BREACH_ABBREVIATION;
}
if (OPERATION_BREACH_MESSAGE.equalsIgnoreCase(
if (TPS_OVER_ACCOUNT_LIMIT.getErrorMessage().equalsIgnoreCase(
splitedServerErrorMessage)) {
return OPERATION_LIMIT_BREACH_ABBREVIATION;
return TPS_LIMIT_BREACH_ABBREVIATION;
}
if (OTHER_SERVER_THROTTLING.getErrorMessage().equalsIgnoreCase(
splitedServerErrorMessage)) {
return OTHER_SERVER_THROTTLING_ABBREVIATION;
}
return HTTP_UNAVAILABLE + "";
}
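To make the message-to-abbreviation mapping concrete, here is a minimal standalone sketch (hypothetical class; the real matching happens in the getAbbreviation method above, against the already-split server error message):

import java.util.LinkedHashMap;
import java.util.Map;

// Hypothetical demo; messages and abbreviations are copied from this commit.
public final class ThrottlingAbbreviationSketch {

  private static final Map<String, String> MESSAGE_TO_ABBREVIATION = new LinkedHashMap<>();
  static {
    MESSAGE_TO_ABBREVIATION.put("Ingress is over the account limit.", "ING");
    MESSAGE_TO_ABBREVIATION.put("Egress is over the account limit.", "EGR");
    MESSAGE_TO_ABBREVIATION.put("Operations per second is over the account limit.", "OPR");
    MESSAGE_TO_ABBREVIATION.put(
        "The server is currently unable to receive requests. Please retry your request.",
        "OTH");
  }

  static String abbreviationFor503(String serverErrorMessage) {
    // Case-insensitive match, mirroring the equalsIgnoreCase calls above;
    // an unrecognized message falls back to the bare status code "503".
    for (Map.Entry<String, String> entry : MESSAGE_TO_ABBREVIATION.entrySet()) {
      if (entry.getKey().equalsIgnoreCase(serverErrorMessage)) {
        return entry.getValue();
      }
    }
    return "503";
  }

  public static void main(String[] args) {
    System.out.println(abbreviationFor503("Egress is over the account limit.")); // EGR
    System.out.println(abbreviationFor503("Some other 503 message"));            // 503
  }
}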
@@ -61,6 +61,7 @@
import static org.apache.hadoop.fs.azurebfs.constants.HttpQueryParams.QUERY_PARAM_POSITION;
import static org.apache.hadoop.fs.azurebfs.constants.TestConfigurationKeys.FS_AZURE_ABFS_ACCOUNT_NAME;
import static org.apache.hadoop.fs.azurebfs.constants.TestConfigurationKeys.TEST_CONFIGURATION_FILE_NAME;
import static org.apache.hadoop.fs.azurebfs.contracts.services.AzureServiceErrorCode.EGRESS_OVER_ACCOUNT_LIMIT;
import static org.apache.hadoop.test.LambdaTestUtils.intercept;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.never;
@@ -233,6 +234,11 @@ private AbfsRestOperation getRestOperation() throws Exception {
// mocked the response code and the response message to check different
// behaviour based on response code.
Mockito.doReturn(responseCode).when(abfsHttpOperation).getConnResponseCode();
if (responseCode == HTTP_UNAVAILABLE) {
Mockito.doReturn(EGRESS_OVER_ACCOUNT_LIMIT.getErrorMessage())
.when(abfsHttpOperation)
.getStorageErrorMessage();
}
Mockito.doReturn(responseMessage)
.when(abfsHttpOperation)
.getConnResponseMessage();
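Because failure classification now reads the storage error message on 503 responses, the mocked operation must also stub getStorageErrorMessage(); returning the egress-limit message makes the request count as an egress breach (EGR) instead of a generic 503.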