Unverified Commit 0d5f3d9c authored by weisd, committed by GitHub
Browse files

Enhanced Checksum Support and Content Validation (#371)



* add content MD5 and SHA-256 checks

* fix(s3s-fs): add checksum_crc64nvme

* Update crates/s3s-e2e/src/basic.rs

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update crates/s3s-e2e/Cargo.toml

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update crates/s3s-e2e/Cargo.toml

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* fix(s3s-fs): add checksum_crc64nvme

* fix(s3s-e2e): remove checksum from multipart upload

* fix(s3s-e2e): remove checksum from multipart upload

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
parent 4ba8c5c6
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
/target

.DS_Store
.vscode
.idea
__pycache__/
+9 −0
Original line number Diff line number Diff line
@@ -1976,6 +1976,12 @@ dependencies = [
 "digest 0.10.7",
]

[[package]]
name = "md5"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"

[[package]]
name = "memchr"
version = "2.7.6"
@@ -2838,10 +2844,13 @@ dependencies = [
 "aws-credential-types",
 "aws-sdk-s3",
 "aws-sdk-sts",
 "base64 0.22.1",
 "base64-simd",
 "bytes",
 "futures",
 "http-body 1.0.1",
 "http-body-util",
 "md5",
 "s3s-test",
 "tracing",
]
+4 −0
Original line number Diff line number Diff line
@@ -23,6 +23,10 @@ http-body-util = "0.1.3"
futures = { version = "0.3.31", default-features = false }
bytes = "1.10.1"
http-body = "1.0.1"
md5 = "0.7.0"
base64 = "0.22.0"
base64-simd = "0.8.0"


[dependencies.aws-config]
version = "1.8.7"
+150 −29
Original line number Diff line number Diff line
@@ -25,6 +25,7 @@ pub fn register(tcx: &mut TestContext) {
    case!(tcx, Basic, Put, test_put_object_with_metadata);
    case!(tcx, Basic, Put, test_put_object_larger);
    case!(tcx, Basic, Put, test_put_object_with_checksum_algorithm);
    case!(tcx, Basic, Put, test_put_object_with_content_checksums);
    case!(tcx, Basic, Copy, test_copy_object);
}

@@ -389,6 +390,13 @@ impl Put {
        let bucket = self.bucket.as_str();
        let key = "with-checksum-trailer";

        for checksum_algorithm in [
            ChecksumAlgorithm::Crc32,
            ChecksumAlgorithm::Crc32C,
            ChecksumAlgorithm::Sha1,
            ChecksumAlgorithm::Sha256,
            ChecksumAlgorithm::Crc64Nvme,
        ] {
            let body = {
                let bytes = Bytes::from_static(&[b'a'; 1024]);

@@ -405,16 +413,31 @@ impl Put {
                .put_object()
                .bucket(bucket)
                .key(key)
            .checksum_algorithm(ChecksumAlgorithm::Crc32)
                .checksum_algorithm(checksum_algorithm.clone())
                .body(body)
                .send()
                .await?;

        let put_crc32 = put_resp
            let put_resp_checksum = match checksum_algorithm {
                ChecksumAlgorithm::Crc32 => put_resp
                    .checksum_crc32()
            .expect("PUT should return checksum when checksum_algorithm is used");
                    .expect("PUT should return checksum when checksum_algorithm is used"),
                ChecksumAlgorithm::Crc32C => put_resp
                    .checksum_crc32_c()
                    .expect("PUT should return checksum when checksum_algorithm is used"),
                ChecksumAlgorithm::Sha1 => put_resp
                    .checksum_sha1()
                    .expect("PUT should return checksum when checksum_algorithm is used"),
                ChecksumAlgorithm::Sha256 => put_resp
                    .checksum_sha256()
                    .expect("PUT should return checksum when checksum_algorithm is used"),
                ChecksumAlgorithm::Crc64Nvme => put_resp
                    .checksum_crc64_nvme()
                    .expect("PUT should return checksum when checksum_algorithm is used"),
                _ => panic!("Unsupported checksum algorithm"),
            };

        let resp = s3
            let mut resp = s3
                .get_object()
                .bucket(bucket)
                .key(key)
@@ -422,16 +445,114 @@ impl Put {
                .send()
                .await?;

        let get_crc32 = resp
            .checksum_crc32()
            .expect("GET should return checksum when checksum_mode is enabled and full object is returned")
            .to_owned();
            let body = std::mem::replace(&mut resp.body, ByteStream::new(SdkBody::empty()))
                .collect()
                .await?;
            let body = String::from_utf8(body.to_vec())?;
            assert_eq!(body, "a".repeat(70 * 1024));

            let get_resp_checksum = match checksum_algorithm {
                ChecksumAlgorithm::Crc32 => resp.checksum_crc32(),
                ChecksumAlgorithm::Crc32C => resp.checksum_crc32_c(),
                ChecksumAlgorithm::Sha1 => resp.checksum_sha1(),
                ChecksumAlgorithm::Sha256 => resp.checksum_sha256(),
                ChecksumAlgorithm::Crc64Nvme => resp.checksum_crc64_nvme(),
                _ => panic!("Unsupported checksum algorithm"),
            };

            assert_eq!(get_resp_checksum, Some(put_resp_checksum));
        }

        Ok(())
    }

    /// Exercises `Content-MD5` handling on `PutObject`.
    ///
    /// Uploads three payloads (a short UTF-8 string, a 2048-byte string and an
    /// empty body), each with the base64-encoded MD5 digest of its own bytes,
    /// then downloads every object and compares the body with what was sent.
    /// Finally it sends two uploads whose `Content-MD5` does not describe the
    /// body and asserts that both requests return an error.
    ///
    /// # Errors
    ///
    /// Propagates, via `?`, any failure from the S3 client calls, from
    /// collecting a response body, or from decoding a body as UTF-8.
    async fn test_put_object_with_content_checksums(self: Arc<Self>) -> Result {
        let s3 = &self.s3;
        let bucket = self.bucket.as_str();
        let key = "file-with-content-checksums";

        // Create test content (deliberately includes multi-byte UTF-8 characters).
        let content = "Hello, World! This is a test content for checksum validation. 你好世界!";
        let content_bytes = content.as_bytes();

        // Calculate MD5 hash; `Content-MD5` carries the base64 of the raw digest bytes.
        let md5_digest = md5::compute(content_bytes);
        let md5_hash = base64_simd::STANDARD.encode_to_string(md5_digest.as_ref());

        // Test with Content-MD5
        s3.put_object()
            .bucket(bucket)
            .key(format!("{key}-md5"))
            .body(ByteStream::from_static(content_bytes))
            .content_md5(&md5_hash)
            .send()
            .await?;

        // Test with different content sizes and MD5
        let large_content = "x".repeat(2048);
        let large_md5_digest = md5::compute(large_content.as_bytes());
        let large_md5_hash = base64_simd::STANDARD.encode_to_string(large_md5_digest.as_ref());

        s3.put_object()
            .bucket(bucket)
            .key(format!("{key}-large"))
            // Cloned because `large_content` is compared against the download below.
            .body(ByteStream::from(large_content.clone().into_bytes()))
            .content_md5(&large_md5_hash)
            .send()
            .await?;

        // Test with empty content and MD5
        let empty_content = "";
        let empty_md5_digest = md5::compute(empty_content.as_bytes());
        let empty_md5_hash = base64_simd::STANDARD.encode_to_string(empty_md5_digest.as_ref());

        s3.put_object()
            .bucket(bucket)
            .key(format!("{key}-empty"))
            .body(ByteStream::from_static(empty_content.as_bytes()))
            .content_md5(&empty_md5_hash)
            .send()
            .await?;

        // Verify all objects were uploaded correctly
        for (suffix, expected_content) in [("md5", content), ("large", &large_content), ("empty", empty_content)] {
            let resp = s3.get_object().bucket(bucket).key(format!("{key}-{suffix}")).send().await?;

            let body = resp.body.collect().await?;
            let body = String::from_utf8(body.to_vec())?;
            assert_eq!(body, expected_content);
        }

        // Test with incorrect MD5 (should fail).
        // The encoded input is 18 arbitrary bytes, so it is not the digest of any body sent here.
        let incorrect_md5 = base64_simd::STANDARD.encode_to_string(b"incorrect_md5_hash");
        let result = s3
            .put_object()
            .bucket(bucket)
            .key(format!("{key}-incorrect-md5"))
            .body(ByteStream::from_static(content_bytes))
            .content_md5(&incorrect_md5)
            .send()
            .await;

        // The request must be rejected rather than stored.
        assert!(result.is_err(), "Expected checksum mismatch error for incorrect MD5");

        // Test with a well-formed MD5 that belongs to different content (should fail)
        let wrong_content = "This is different content";
        let wrong_md5_digest = md5::compute(wrong_content.as_bytes());
        let wrong_md5_hash = base64_simd::STANDARD.encode_to_string(wrong_md5_digest.as_ref());

        let result = s3
            .put_object()
            .bucket(bucket)
            .key(format!("{key}-wrong-content"))
            .body(ByteStream::from_static(content_bytes)) // Using original content
            .content_md5(&wrong_md5_hash) // But the digest of `wrong_content`
            .send()
            .await;

        // This should also fail
        assert!(result.is_err(), "Expected checksum mismatch error for wrong content with correct MD5");

        Ok(())
    }
+8 −0
Original line number Diff line number Diff line
@@ -15,6 +15,10 @@ pub fn modify_internal_info(info: &mut serde_json::Map<String, serde_json::Value
    if let Some(checksum_sha256) = &checksum.checksum_sha256 {
        info.insert("checksum_sha256".to_owned(), serde_json::Value::String(checksum_sha256.clone()));
    }

    if let Some(checksum_crc64nvme) = &checksum.checksum_crc64nvme {
        info.insert("checksum_crc64nvme".to_owned(), serde_json::Value::String(checksum_crc64nvme.clone()));
    }
}

pub fn from_internal_info(info: &InternalInfo) -> s3s::dto::Checksum {
@@ -31,5 +35,9 @@ pub fn from_internal_info(info: &InternalInfo) -> s3s::dto::Checksum {
    if let Some(checksum_sha256) = info.get("checksum_sha256") {
        ans.checksum_sha256 = Some(checksum_sha256.as_str().unwrap().to_owned());
    }

    if let Some(checksum_crc64nvme) = info.get("checksum_crc64nvme") {
        ans.checksum_crc64nvme = Some(checksum_crc64nvme.as_str().unwrap().to_owned());
    }
    ans
}
Loading