Unverified Commit e6005fb1 authored by Zelda Hessler's avatar Zelda Hessler Committed by GitHub
Browse files

Feature: make retry strategy exponential backoff multiplier configurable (#1548)

* feature: make retry strategy backoff configurable
update: smithy client to respect configurable backoff
update: CHANGELOG.next.toml

* fix: calculate_exponential_backoff

* update: make the backoff calculation simpler
update: default backoff multiplier to be 1 second to better match previous retry behavior

* update: retry builder backoff multiplier default

* rename: backoff_multiplier to initial_backoff
update: calculate_exponential_backoff tests for readability

* remove: copypasted code

* update: CHANGELOG.next.toml
parent c6193bd3
Loading
Loading
Loading
Loading
+50 −0
Original line number Diff line number Diff line
@@ -36,3 +36,53 @@ message = "Re-export aws_types::SdkConfig in aws_config"
references = ["smithy-rs#1457"]
meta = { "breaking" = false, "tada" = true, "bug" = false }
author = "calavera"

[[smithy-rs]]
message = """
Updated the smithy client's retry behavior to allow for a configurable initial backoff. Previously, the initial backoff
(named `r` in the code) was set to 2 seconds. This is not an ideal default for services that expect clients to quickly
retry failed request attempts. Now, users can set quicker (or slower) backoffs according to their needs.
"""
references = ["aws-sdk-rust#567"]
meta = { "breaking" = false, "tada" = true, "bug" = false }
author = "Velfi"

[[aws-sdk-rust]]
message = """
Updated SDK Client retry behavior to allow for a configurable initial backoff. Previously, the initial backoff
(named `r` in the code) was set to 2 seconds. This is not an ideal default for services like DynamoDB that expect
clients to quickly retry failed request attempts. Now, users can set quicker (or slower) backoffs according to their
needs.

```rust
#[tokio::main]
async fn main() -> Result<(), aws_sdk_dynamodb::Error> {
    let retry_config = aws_smithy_types::retry::RetryConfigBuilder::new()
        .max_attempts(4)
        .initial_backoff(Duration::from_millis(20));

    let shared_config = aws_config::from_env()
        .retry_config(retry_config)
        .load()
        .await;

    let client = aws_sdk_dynamodb::Client::new(&shared_config);

    // Given the 20ms backoff multiplier, and assuming this request fails 3 times before succeeding,
    // the first retry would take place between 0-20ms after the initial request,
    // the second retry would take place between 0-40ms after the first retry,
    // and the third retry would take place between 0-80ms after the second retry.
    let request = client
        .put_item()
        .table_name("users")
        .item("username", "Velfi")
        .item("account_type", "Developer")
        .send().await?;

    Ok(())
}
```
"""
references = ["aws-sdk-rust#567"]
meta = { "breaking" = false, "tada" = true, "bug" = false }
author = "Velfi"
+83 −17
Original line number Diff line number Diff line
@@ -22,11 +22,12 @@ use std::sync::{Arc, Mutex};
use std::time::Duration;

use crate::{SdkError, SdkSuccess};

use aws_smithy_async::rt::sleep::AsyncSleep;
use aws_smithy_http::operation;
use aws_smithy_http::operation::Operation;
use aws_smithy_http::retry::ClassifyResponse;
use aws_smithy_types::retry::{ErrorKind, RetryKind};

use tracing::Instrument;

/// A policy instantiator.
@@ -57,6 +58,7 @@ pub struct Config {
    no_retry_increment: usize,
    timeout_retry_cost: usize,
    max_attempts: u32,
    initial_backoff: Duration,
    max_backoff: Duration,
    base: fn() -> f64,
}
@@ -82,6 +84,24 @@ impl Config {
        self.max_attempts = max_attempts;
        self
    }

    /// Override the default backoff multiplier of 1 second.
    ///
    /// ## Example
    ///
    /// For a request that gets retried 3 times, when initial_backoff is 1 second:
    /// - the first retry will occur after 0 to 1 seconds
    /// - the second retry will occur after 0 to 2 seconds
    /// - the third retry will occur after 0 to 4 seconds
    ///
    /// For a request that gets retried 3 times, when initial_backoff is 30 milliseconds:
    /// - the first retry will occur after 0 to 30 milliseconds
    /// - the second retry will occur after 0 to 60 milliseconds
    /// - the third retry will occur after 0 to 120 milliseconds
    pub fn with_initial_backoff(mut self, initial_backoff: Duration) -> Self {
        self.initial_backoff = initial_backoff;
        self
    }
}

impl Default for Config {
@@ -95,13 +115,16 @@ impl Default for Config {
            max_backoff: Duration::from_secs(20),
            // by default, use a random base for exponential backoff
            base: fastrand::f64,
            initial_backoff: Duration::from_secs(1),
        }
    }
}

impl From<aws_smithy_types::retry::RetryConfig> for Config {
    fn from(conf: aws_smithy_types::retry::RetryConfig) -> Self {
        Self::default().with_max_attempts(conf.max_attempts())
        Self::default()
            .with_max_attempts(conf.max_attempts())
            .with_initial_backoff(conf.initial_backoff())
    }
}

@@ -250,6 +273,19 @@ impl RetryHandler {
    }
}

/// For a request that gets retried 3 times, when base is 1 and initial_backoff is 2 seconds:
/// - the first retry will occur after 0 to 2 seconds
/// - the second retry will occur after 0 to 4 seconds
/// - the third retry will occur after 0 to 8 seconds
///
/// For a request that gets retried 3 times, when base is 1 and initial_backoff is 30 milliseconds:
/// - the first retry will occur after 0 to 30 milliseconds
/// - the second retry will occur after 0 to 60 milliseconds
/// - the third retry will occur after 0 to 120 milliseconds
fn calculate_exponential_backoff(base: f64, initial_backoff: f64, retry_attempts: u32) -> f64 {
    base * initial_backoff * 2_u32.pow(retry_attempts) as f64
}

impl RetryHandler {
    /// Determine the correct response given `retry_kind`
    ///
@@ -262,17 +298,15 @@ impl RetryHandler {
            }
            self.shared.quota_acquire(error_kind, &self.config)?
        };
        /*
        From the retry spec:
            b = random number within the range of: 0 <= b <= 1
            r = 2
            t_i = min(br^i, MAX_BACKOFF);
         */
        let r: i32 = 2;
        let b = (self.config.base)();
        let backoff = calculate_exponential_backoff(
            // Generate a random base multiplier to create jitter
            (self.config.base)(),
            // Get the backoff time multiplier in seconds (with fractional seconds)
            self.config.initial_backoff.as_secs_f64(),
            // `self.local.attempts` tracks number of requests made including the initial request
            // The initial attempt shouldn't count towards backoff calculations so we subtract it
        let backoff = b * (r.pow(self.local.attempts - 1) as f64);
            self.local.attempts - 1,
        );
        let backoff = Duration::from_secs_f64(backoff).min(self.config.max_backoff);
        let next = RetryHandler {
            local: RequestLocalRetryState {
@@ -330,8 +364,7 @@ impl RetryHandler {
    }
}

impl<Handler, R, T, E>
    tower::retry::Policy<operation::Operation<Handler, R>, SdkSuccess<T>, SdkError<E>>
impl<Handler, R, T, E> tower::retry::Policy<Operation<Handler, R>, SdkSuccess<T>, SdkError<E>>
    for RetryHandler
where
    Handler: Clone,
@@ -360,8 +393,7 @@ fn check_send<T: Send>(t: T) -> T {

#[cfg(test)]
mod test {

    use crate::retry::{Config, NewRequestPolicy, RetryHandler, Standard};
    use super::{calculate_exponential_backoff, Config, NewRequestPolicy, RetryHandler, Standard};

    use aws_smithy_types::retry::{ErrorKind, RetryKind};

@@ -502,6 +534,7 @@ mod test {
    fn max_backoff_time() {
        let mut conf = test_config();
        conf.max_attempts = 5;
        conf.initial_backoff = Duration::from_secs(1);
        conf.max_backoff = Duration::from_secs(3);
        let policy = Standard::new(conf).new_request_policy(None);
        let (policy, dur) = policy
@@ -532,4 +565,37 @@ mod test {
        assert!(no_retry.is_none());
        assert_eq!(policy.retry_quota(), 480);
    }

    #[test]
    fn calculate_exponential_backoff_where_initial_backoff_is_one() {
        let initial_backoff = 1.0;

        for (attempt, expected_backoff) in [initial_backoff, 2.0, 4.0].into_iter().enumerate() {
            let actual_backoff =
                calculate_exponential_backoff(1.0, initial_backoff, attempt as u32);
            assert_eq!(expected_backoff, actual_backoff);
        }
    }

    #[test]
    fn calculate_exponential_backoff_where_initial_backoff_is_greater_than_one() {
        let initial_backoff = 3.0;

        for (attempt, expected_backoff) in [initial_backoff, 6.0, 12.0].into_iter().enumerate() {
            let actual_backoff =
                calculate_exponential_backoff(1.0, initial_backoff, attempt as u32);
            assert_eq!(expected_backoff, actual_backoff);
        }
    }

    #[test]
    fn calculate_exponential_backoff_where_initial_backoff_is_less_than_one() {
        let initial_backoff = 0.03;

        for (attempt, expected_backoff) in [initial_backoff, 0.06, 0.12].into_iter().enumerate() {
            let actual_backoff =
                calculate_exponential_backoff(1.0, initial_backoff, attempt as u32);
            assert_eq!(expected_backoff, actual_backoff);
        }
    }
}
+6 −5
Original line number Diff line number Diff line
@@ -3,7 +3,7 @@
 * SPDX-License-Identifier: Apache-2.0
 */

use crate::test_operation::TestPolicy;
use crate::test_operation::{TestOperationParser, TestPolicy};
use aws_smithy_async::rt::sleep::TokioSleep;

use aws_smithy_client::test_connection::TestConnection;
@@ -14,7 +14,6 @@ use aws_smithy_http::operation::Operation;
use aws_smithy_http::result::SdkError;
use std::sync::Arc;
use std::time::Duration;
use tokio::time::Instant;
use tower::layer::util::Identity;

mod test_operation {
@@ -89,14 +88,14 @@ mod test_operation {
    }
}

fn test_operation() -> Operation<test_operation::TestOperationParser, test_operation::TestPolicy> {
fn test_operation() -> Operation<TestOperationParser, TestPolicy> {
    let req = operation::Request::new(
        http::Request::builder()
            .uri("https://test-service.test-region.amazonaws.com/")
            .body(SdkBody::from("request body"))
            .unwrap(),
    );
    Operation::new(req, test_operation::TestOperationParser).with_retry_policy(TestPolicy)
    Operation::new(req, TestOperationParser).with_retry_policy(TestPolicy)
}

#[tokio::test]
@@ -138,6 +137,8 @@ async fn end_to_end_retry_test() {
    let conn = TestConnection::new(events);
    let retry_config = aws_smithy_client::retry::Config::default()
        .with_max_attempts(4)
        // This is the default, just setting it to be explicit
        .with_initial_backoff(Duration::from_secs(1))
        .with_base(|| 1_f64);
    let client = Client::<TestConnection<_>, Identity>::new(conn.clone())
        .with_retry_config(retry_config)
@@ -174,7 +175,7 @@ async fn end_to_end_retry_test() {
/// Validate that time has passed with a 5ms tolerance
///
/// This is to account for some non-determinism in the Tokio timer
fn assert_time_passed(initial: Instant, passed: Duration) {
fn assert_time_passed(initial: tokio::time::Instant, passed: Duration) {
    let now = tokio::time::Instant::now();
    let delta = now - initial;
    if (delta.as_millis() as i128 - passed.as_millis() as i128).abs() > 5 {
+55 −8
Original line number Diff line number Diff line
@@ -134,6 +134,7 @@ impl FromStr for RetryMode {
pub struct RetryConfigBuilder {
    mode: Option<RetryMode>,
    max_attempts: Option<u32>,
    initial_backoff: Option<Duration>,
}

impl RetryConfigBuilder {
@@ -148,24 +149,36 @@ impl RetryConfigBuilder {
        self
    }

    /// Sets the max attempts. This value must be greater than zero.
    pub fn set_max_attempts(&mut self, max_attempts: Option<u32>) -> &mut Self {
        self.max_attempts = max_attempts;
        self
    }

    /// Sets the retry mode.
    pub fn mode(mut self, mode: RetryMode) -> Self {
        self.set_mode(Some(mode));
        self
    }

    /// Sets the max attempts. This value must be greater than zero.
    pub fn set_max_attempts(&mut self, max_attempts: Option<u32>) -> &mut Self {
        self.max_attempts = max_attempts;
        self
    }

    /// Sets the max attempts. This value must be greater than zero.
    pub fn max_attempts(mut self, max_attempts: u32) -> Self {
        self.set_max_attempts(Some(max_attempts));
        self
    }

    /// Set the initial_backoff duration. This duration should be non-zero.
    pub fn set_initial_backoff(&mut self, initial_backoff: Option<Duration>) -> &mut Self {
        self.initial_backoff = initial_backoff;
        self
    }

    /// Set the initial_backoff duration. This duration should be non-zero.
    pub fn initial_backoff(mut self, initial_backoff: Duration) -> Self {
        self.set_initial_backoff(Some(initial_backoff));
        self
    }

    /// Merge two builders together. Values from `other` will only be used as a fallback for values
    /// from `self` Useful for merging configs from different sources together when you want to
    /// handle "precedence" per value instead of at the config level
@@ -186,6 +199,7 @@ impl RetryConfigBuilder {
        Self {
            mode: self.mode.or(other.mode),
            max_attempts: self.max_attempts.or(other.max_attempts),
            initial_backoff: self.initial_backoff.or(other.initial_backoff),
        }
    }

@@ -194,6 +208,9 @@ impl RetryConfigBuilder {
        RetryConfig {
            mode: self.mode.unwrap_or(RetryMode::Standard),
            max_attempts: self.max_attempts.unwrap_or(3),
            initial_backoff: self
                .initial_backoff
                .unwrap_or_else(|| Duration::from_secs(1)),
        }
    }
}
@@ -204,6 +221,7 @@ impl RetryConfigBuilder {
pub struct RetryConfig {
    mode: RetryMode,
    max_attempts: u32,
    initial_backoff: Duration,
}

impl RetryConfig {
@@ -217,18 +235,41 @@ impl RetryConfig {
        Self::default().with_max_attempts(1)
    }

    /// Changes the retry mode.
    /// Set this config's [retry mode](RetryMode).
    pub fn with_retry_mode(mut self, retry_mode: RetryMode) -> Self {
        self.mode = retry_mode;
        self
    }

    /// Changes the max attempts. This value must be greater than zero.
    /// Set the maximum number of times a request should be tried, including the initial attempt.
    /// This value must be greater than zero.
    pub fn with_max_attempts(mut self, max_attempts: u32) -> Self {
        self.max_attempts = max_attempts;
        self
    }

    /// Set the multiplier used when calculating backoff times as part of an
    /// [exponential backoff with jitter](https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/)
    /// strategy. Most services should work fine with the default duration of 1 second, but if you
    /// find that your requests are taking too long due to excessive retry backoff, try lowering
    /// this value.
    ///
    /// ## Example
    ///
    /// *For a request that gets retried 3 times, when initial_backoff is 1 seconds:*
    /// - the first retry will occur after 0 to 1 seconds
    /// - the second retry will occur after 0 to 2 seconds
    /// - the third retry will occur after 0 to 4 seconds
    ///
    /// *For a request that gets retried 3 times, when initial_backoff is 30 milliseconds:*
    /// - the first retry will occur after 0 to 30 milliseconds
    /// - the second retry will occur after 0 to 60 milliseconds
    /// - the third retry will occur after 0 to 120 milliseconds
    pub fn with_initial_backoff(mut self, initial_backoff: Duration) -> Self {
        self.initial_backoff = initial_backoff;
        self
    }

    /// Returns the retry mode.
    pub fn mode(&self) -> RetryMode {
        self.mode
@@ -238,6 +279,11 @@ impl RetryConfig {
    pub fn max_attempts(&self) -> u32 {
        self.max_attempts
    }

    /// Returns the backoff multiplier duration.
    pub fn initial_backoff(&self) -> Duration {
        self.initial_backoff
    }
}

impl Default for RetryConfig {
@@ -245,6 +291,7 @@ impl Default for RetryConfig {
        Self {
            mode: RetryMode::Standard,
            max_attempts: 3,
            initial_backoff: Duration::from_secs(1),
        }
    }
}