Commit cecb762b authored by Artur Paszkiewicz, committed by Tomasz Zawadzki
Browse files

raid5f: degraded reads support



To handle a read of a chunk from a missing base bdev we must first read
all the other chunks of this stripe and then xor the data.

Change-Id: I1af2ffc3fe1f41b798e15b5194ab5695923737ef
Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/15270


Reviewed-by: Jim Harris <jim.harris@gmail.com>
Reviewed-by: Krzysztof Karas <krzysztof.karas@intel.com>
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
parent 84dd9c3a
Loading
Loading
Loading
Loading
+165 −20
Original line number Diff line number Diff line
@@ -71,6 +71,12 @@ struct stripe_request {

			/* Array of buffers for reading chunk metadata */
			void **chunk_md_buffers;

			/* Chunk to reconstruct from parity */
			struct chunk *chunk;

			/* Offset from chunk start */
			uint64_t chunk_offset;
		} reconstruct;
	};

@@ -278,46 +284,60 @@ raid5f_xor_stripe(struct stripe_request *stripe_req, stripe_req_xor_cb cb)
	struct raid_bdev_io *raid_io = stripe_req->raid_io;
	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io);
	void *raid_md = spdk_bdev_io_get_md_buf(bdev_io);
	uint32_t raid_md_size = spdk_bdev_get_md_size(&raid_bdev->bdev);
	struct chunk *chunk;
	struct chunk *dest_chunk;
	uint64_t num_blocks;
	uint8_t c;

	assert(cb != NULL);
	assert(stripe_req->type == STRIPE_REQ_WRITE);

	if (spdk_likely(stripe_req->type == STRIPE_REQ_WRITE)) {
		num_blocks = raid_bdev->strip_size;
		dest_chunk = stripe_req->parity_chunk;
	} else if (stripe_req->type == STRIPE_REQ_RECONSTRUCT) {
		num_blocks = bdev_io->u.bdev.num_blocks;
		dest_chunk = stripe_req->reconstruct.chunk;
	} else {
		assert(false);
	}

	c = 0;
	FOR_EACH_DATA_CHUNK(stripe_req, chunk) {
	FOR_EACH_CHUNK(stripe_req, chunk) {
		if (chunk == dest_chunk) {
			continue;
		}
		r5ch->chunk_xor_iovs[c] = chunk->iovs;
		r5ch->chunk_xor_iovcnt[c] = chunk->iovcnt;
		c++;
	}
	r5ch->chunk_xor_iovs[c] = stripe_req->parity_chunk->iovs;
	r5ch->chunk_xor_iovcnt[c] = stripe_req->parity_chunk->iovcnt;
	r5ch->chunk_xor_iovs[c] = dest_chunk->iovs;
	r5ch->chunk_xor_iovcnt[c] = dest_chunk->iovcnt;

	stripe_req->xor.len = spdk_ioviter_firstv(stripe_req->chunk_iov_iters,
			      raid_bdev->num_base_bdevs,
			      r5ch->chunk_xor_iovs,
			      r5ch->chunk_xor_iovcnt,
			      r5ch->chunk_xor_buffers);
	stripe_req->xor.remaining = raid_bdev->strip_size << raid_bdev->blocklen_shift;
	stripe_req->xor.remaining = num_blocks << raid_bdev->blocklen_shift;
	stripe_req->xor.status = 0;
	stripe_req->xor.cb = cb;

	if (raid_md != NULL) {
	if (spdk_bdev_io_get_md_buf(bdev_io)) {
		uint8_t n_src = raid5f_stripe_data_chunks_num(raid_bdev);
		uint64_t len = raid_bdev->strip_size * raid_md_size;
		uint64_t len = num_blocks * spdk_bdev_get_md_size(&raid_bdev->bdev);
		int ret;

		stripe_req->xor.remaining_md = len;

		c = 0;
		FOR_EACH_DATA_CHUNK(stripe_req, chunk) {
		FOR_EACH_CHUNK(stripe_req, chunk) {
			if (chunk != dest_chunk) {
				stripe_req->chunk_xor_md_buffers[c] = chunk->md_buf;
				c++;
			}
		}

		ret = spdk_accel_submit_xor(stripe_req->r5ch->accel_ch, stripe_req->write.parity_md_buf,
		ret = spdk_accel_submit_xor(stripe_req->r5ch->accel_ch, dest_chunk->md_buf,
					    stripe_req->chunk_xor_md_buffers, n_src, len,
					    raid5f_xor_stripe_md_cb, stripe_req);
		if (spdk_unlikely(ret)) {
@@ -353,6 +373,35 @@ raid5f_stripe_request_chunk_write_complete(struct stripe_request *stripe_req,
	}
}

/*
 * Completion callback for the XOR step of a reconstruct (degraded) read.
 *
 * Returns the stripe request to its free list and completes the last
 * outstanding part of the raid I/O, mapping the accel XOR status
 * (0 = success) to the corresponding bdev I/O status.
 */
static void
raid5f_stripe_request_reconstruct_xor_done(struct stripe_request *stripe_req, int status)
{
	struct raid_bdev_io *raid_io = stripe_req->raid_io;

	/* Release before completing so the request is reusable immediately. */
	raid5f_stripe_request_release(stripe_req);

	raid_bdev_io_complete_part(raid_io, 1,
				   status == 0 ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED);
}

/*
 * Per-chunk read completion for a reconstruct (degraded) read.
 *
 * Each surviving chunk read completes one "part" of the raid I/O.  When this
 * is the last outstanding chunk read and all reads (including this one)
 * succeeded, kick off the XOR that rebuilds the missing chunk's data; the
 * final part is then completed by the XOR callback instead of here.  On any
 * failure of the last read, release the stripe request and complete with the
 * failing status.
 */
static void
raid5f_stripe_request_chunk_read_complete(struct stripe_request *stripe_req,
		enum spdk_bdev_io_status status)
{
	struct raid_bdev_io *raid_io = stripe_req->raid_io;

	if (raid_io->base_bdev_io_remaining == 1) {
		/* Last chunk read: proceed to XOR only if every read succeeded. */
		if (raid_io->base_bdev_io_status == SPDK_BDEV_IO_STATUS_SUCCESS &&
		    status == SPDK_BDEV_IO_STATUS_SUCCESS) {
			raid5f_xor_stripe(stripe_req, raid5f_stripe_request_reconstruct_xor_done);
			return;
		}
		raid5f_stripe_request_release(stripe_req);
	}

	raid_bdev_io_complete_part(raid_io, 1, status);
}

static void
raid5f_chunk_complete_bdev_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
@@ -365,6 +414,8 @@ raid5f_chunk_complete_bdev_io(struct spdk_bdev_io *bdev_io, bool success, void *

	if (spdk_likely(stripe_req->type == STRIPE_REQ_WRITE)) {
		raid5f_stripe_request_chunk_write_complete(stripe_req, status);
	} else if (stripe_req->type == STRIPE_REQ_RECONSTRUCT) {
		raid5f_stripe_request_chunk_read_complete(stripe_req, status);
	} else {
		assert(false);
	}
@@ -420,6 +471,19 @@ raid5f_chunk_submit(struct chunk *chunk)
						  raid5f_chunk_complete_bdev_io, chunk,
						  &chunk->ext_opts);
		break;
	case STRIPE_REQ_RECONSTRUCT:
		if (chunk == stripe_req->reconstruct.chunk) {
			raid_bdev_io_complete_part(raid_io, 1, SPDK_BDEV_IO_STATUS_SUCCESS);
			return 0;
		}

		base_offset_blocks += stripe_req->reconstruct.chunk_offset;

		ret = spdk_bdev_readv_blocks_ext(base_info->desc, base_ch, chunk->iovs, chunk->iovcnt,
						 base_offset_blocks, bdev_io->u.bdev.num_blocks,
						 raid5f_chunk_complete_bdev_io, chunk,
						 &chunk->ext_opts);
		break;
	default:
		assert(false);
		ret = -EINVAL;
@@ -437,10 +501,17 @@ raid5f_chunk_submit(struct chunk *chunk)
			 * these means there are no more to complete for the stripe request, we can
			 * release the stripe request as well.
			 */
			uint64_t base_bdev_io_not_submitted = raid_bdev->num_base_bdevs -
			uint64_t base_bdev_io_not_submitted;

			if (stripe_req->type == STRIPE_REQ_WRITE) {
				base_bdev_io_not_submitted = raid_bdev->num_base_bdevs -
							     raid_io->base_bdev_io_submitted;
			} else {
				base_bdev_io_not_submitted = raid5f_stripe_data_chunks_num(raid_bdev) -
							     raid_io->base_bdev_io_submitted;
			}

			if (raid_bdev_io_complete_part(stripe_req->raid_io, base_bdev_io_not_submitted,
			if (raid_bdev_io_complete_part(raid_io, base_bdev_io_not_submitted,
						       SPDK_BDEV_IO_STATUS_FAILED)) {
				raid5f_stripe_request_release(stripe_req);
			}
@@ -552,6 +623,16 @@ raid5f_stripe_request_submit_chunks(struct stripe_request *stripe_req)
	}
}

/*
 * Common initialization shared by write and reconstruct-read stripe requests:
 * bind the request to the raid I/O, record the stripe index, and locate the
 * parity chunk for this stripe (its position rotates with the stripe index).
 */
static inline void
raid5f_stripe_request_init(struct stripe_request *stripe_req, struct raid_bdev_io *raid_io,
			   uint64_t stripe_index)
{
	stripe_req->raid_io = raid_io;
	stripe_req->stripe_index = stripe_index;
	stripe_req->parity_chunk = &stripe_req->chunks[raid5f_stripe_parity_chunk_index(raid_io->raid_bdev,
				   stripe_index)];
}

static void
raid5f_stripe_write_request_xor_done(struct stripe_request *stripe_req, int status)
{
@@ -578,10 +659,7 @@ raid5f_submit_write_request(struct raid_bdev_io *raid_io, uint64_t stripe_index)
		return -ENOMEM;
	}

	stripe_req->stripe_index = stripe_index;
	stripe_req->parity_chunk = stripe_req->chunks + raid5f_stripe_parity_chunk_index(raid_bdev,
				   stripe_req->stripe_index);
	stripe_req->raid_io = raid_io;
	raid5f_stripe_request_init(stripe_req, raid_io, stripe_index);

	ret = raid5f_stripe_request_map_iovecs(stripe_req);
	if (spdk_unlikely(ret)) {
@@ -623,6 +701,69 @@ _raid5f_submit_rw_request(void *_raid_io)
	raid5f_submit_rw_request(raid_io);
}

/*
 * Submit a degraded read: the target chunk's base bdev is missing, so read
 * the same block range from every other chunk of the stripe into internal
 * buffers and reconstruct the missing data by XOR.
 *
 * chunk_idx    - index of the chunk whose base bdev is missing; the
 *                reconstructed data for it lands directly in the caller's
 *                bdev_io iovecs.
 * chunk_offset - block offset of the read within the chunk; the surviving
 *                chunks are read at the same offset.
 *
 * Returns 0 on successful submission, -ENOMEM when no reconstruct stripe
 * request is currently free (caller is expected to retry), or the error from
 * raid5f_chunk_set_iovcnt().
 */
static int
raid5f_submit_reconstruct_read(struct raid_bdev_io *raid_io, uint64_t stripe_index,
			       uint8_t chunk_idx, uint64_t chunk_offset)
{
	struct raid_bdev *raid_bdev = raid_io->raid_bdev;
	struct raid5f_io_channel *r5ch = spdk_io_channel_get_ctx(raid_io->raid_ch->module_channel);
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io);
	void *bdev_io_md = spdk_bdev_io_get_md_buf(bdev_io);
	struct stripe_request *stripe_req;
	struct chunk *chunk;
	int buf_idx;

	stripe_req = TAILQ_FIRST(&r5ch->free_stripe_requests.reconstruct);
	if (!stripe_req) {
		/* No free request - report ENOMEM so the I/O can be queued for retry. */
		return -ENOMEM;
	}

	raid5f_stripe_request_init(stripe_req, raid_io, stripe_index);

	stripe_req->reconstruct.chunk = &stripe_req->chunks[chunk_idx];
	stripe_req->reconstruct.chunk_offset = chunk_offset;
	buf_idx = 0;

	FOR_EACH_CHUNK(stripe_req, chunk) {
		if (chunk == stripe_req->reconstruct.chunk) {
			int i;
			int ret;

			/* The missing chunk is the XOR destination: point it at the
			 * caller's iovecs so the reconstructed data is written there.
			 */
			ret = raid5f_chunk_set_iovcnt(chunk, bdev_io->u.bdev.iovcnt);
			if (ret) {
				return ret;
			}

			for (i = 0; i < bdev_io->u.bdev.iovcnt; i++) {
				chunk->iovs[i] = bdev_io->u.bdev.iovs[i];
			}

			chunk->md_buf = bdev_io_md;
		} else {
			/* Surviving chunks read into the request's preallocated buffers. */
			struct iovec *iov = &chunk->iovs[0];

			iov->iov_base = stripe_req->reconstruct.chunk_buffers[buf_idx];
			iov->iov_len = bdev_io->u.bdev.num_blocks << raid_bdev->blocklen_shift;
			chunk->iovcnt = 1;

			if (bdev_io_md) {
				chunk->md_buf = stripe_req->reconstruct.chunk_md_buffers[buf_idx];
			}

			buf_idx++;
		}
	}

	raid_io->module_private = stripe_req;
	/* One part per base bdev; the missing chunk's part is completed
	 * immediately at submit time (see raid5f_chunk_submit).
	 */
	raid_io->base_bdev_io_remaining = raid_bdev->num_base_bdevs;

	TAILQ_REMOVE(&r5ch->free_stripe_requests.reconstruct, stripe_req, link);

	raid5f_stripe_request_submit_chunks(stripe_req);

	return 0;
}

static int
raid5f_submit_read_request(struct raid_bdev_io *raid_io, uint64_t stripe_index,
			   uint64_t stripe_offset)
@@ -640,6 +781,10 @@ raid5f_submit_read_request(struct raid_bdev_io *raid_io, uint64_t stripe_index,
	int ret;

	raid5f_init_ext_io_opts(bdev_io, &io_opts);
	if (base_ch == NULL) {
		return raid5f_submit_reconstruct_read(raid_io, stripe_index, chunk_idx, chunk_offset);
	}

	ret = spdk_bdev_readv_blocks_ext(base_info->desc, base_ch, bdev_io->u.bdev.iovs,
					 bdev_io->u.bdev.iovcnt,
					 base_offset_blocks, bdev_io->u.bdev.num_blocks, raid5f_chunk_read_complete, raid_io,
+113 −2
Original line number Diff line number Diff line
@@ -239,6 +239,7 @@ struct raid_io_info {
	enum spdk_bdev_io_type io_type;
	uint64_t stripe_index;
	uint64_t offset_blocks;
	uint64_t stripe_offset_blocks;
	uint64_t num_blocks;
	void *src_buf;
	void *dest_buf;
@@ -251,6 +252,8 @@ struct raid_io_info {
	void *parity_md_buf;
	void *reference_md_parity;
	size_t parity_md_buf_size;
	void *degraded_buf;
	void *degraded_md_buf;
	enum spdk_bdev_io_status status;
	TAILQ_HEAD(, spdk_bdev_io) bdev_io_queue;
	TAILQ_HEAD(, spdk_bdev_io_wait_entry) bdev_io_wait_queue;
@@ -469,6 +472,51 @@ submit:
	return submit_io(io_info, desc, cb, cb_arg);
}

/*
 * Unit-test stub servicing the per-chunk reads issued by a reconstruct
 * (degraded) read.  Fills the chunk's single iovec (and metadata buffer, if
 * any) from the test's reference data: the parity chunk comes from the
 * precomputed reference parity, data chunks come from the degraded stripe
 * image at the chunk's data index.  cb_arg is the driver's struct chunk.
 */
static int
spdk_bdev_readv_blocks_degraded(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
				struct iovec *iov, int iovcnt, void *md_buf,
				uint64_t offset_blocks, uint64_t num_blocks,
				spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct chunk *chunk = cb_arg;
	struct stripe_request *stripe_req;
	struct test_raid_bdev_io *test_raid_bdev_io;
	struct raid_io_info *io_info;
	struct raid_bdev *raid_bdev;
	uint8_t data_chunk_idx;
	void *buf, *buf_md;

	SPDK_CU_ASSERT_FATAL(cb == raid5f_chunk_complete_bdev_io);
	SPDK_CU_ASSERT_FATAL(iovcnt == 1);

	stripe_req = raid5f_chunk_stripe_req(chunk);
	test_raid_bdev_io = (struct test_raid_bdev_io *)spdk_bdev_io_from_ctx(stripe_req->raid_io);
	io_info = test_raid_bdev_io->io_info;
	raid_bdev = io_info->r5f_info->raid_bdev;

	if (chunk == stripe_req->parity_chunk) {
		buf = io_info->reference_parity;
		buf_md = io_info->reference_md_parity;
	} else {
		/* Data chunks after the parity chunk are shifted down by one in the
		 * contiguous degraded stripe image.
		 */
		data_chunk_idx = chunk < stripe_req->parity_chunk ? chunk->index : chunk->index - 1;
		buf = io_info->degraded_buf +
		      data_chunk_idx * raid_bdev->strip_size * raid_bdev->bdev.blocklen;
		buf_md = io_info->degraded_md_buf +
			 data_chunk_idx * raid_bdev->strip_size * raid_bdev->bdev.md_len;
	}

	/* Advance to the read's offset within the chunk. */
	buf += (offset_blocks % raid_bdev->strip_size) * raid_bdev->bdev.blocklen;
	buf_md += (offset_blocks % raid_bdev->strip_size) * raid_bdev->bdev.md_len;

	SPDK_CU_ASSERT_FATAL(num_blocks * raid_bdev->bdev.blocklen <= iov->iov_len);
	memcpy(iov->iov_base, buf, num_blocks * raid_bdev->bdev.blocklen);
	if (md_buf != NULL) {
		memcpy(md_buf, buf_md, num_blocks * raid_bdev->bdev.md_len);
	}

	return submit_io(io_info, desc, cb, cb_arg);
}

int
spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			struct iovec *iov, int iovcnt,
@@ -501,11 +549,16 @@ spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_chann
	struct raid_bdev_io *raid_io = cb_arg;
	struct test_raid_bdev_io *test_raid_bdev_io;

	SPDK_CU_ASSERT_FATAL(cb == raid5f_chunk_read_complete);
	SPDK_CU_ASSERT_FATAL(iovcnt == 1);
	if (cb == raid5f_chunk_complete_bdev_io) {
		return spdk_bdev_readv_blocks_degraded(desc, ch, iov, iovcnt, md_buf, offset_blocks,
						       num_blocks, cb, cb_arg);
	}

	test_raid_bdev_io = (struct test_raid_bdev_io *)spdk_bdev_io_from_ctx(raid_io);

	SPDK_CU_ASSERT_FATAL(cb == raid5f_chunk_read_complete);
	SPDK_CU_ASSERT_FATAL(iovcnt == 1);

	memcpy(iov->iov_base, test_raid_bdev_io->buf, iov->iov_len);
	if (md_buf != NULL) {
		memcpy(md_buf, test_raid_bdev_io->buf_md, DATA_OFFSET_TO_MD_OFFSET(raid_io->raid_bdev,
@@ -616,6 +669,11 @@ test_raid5f_read_request(struct raid_io_info *io_info)
	raid5f_submit_rw_request(raid_io);

	process_io_completions(io_info);

	if (g_test_degraded) {
		/* for the reconstruct read xor callback */
		poll_threads();
	}
}

static void
@@ -629,6 +687,8 @@ deinit_io_info(struct raid_io_info *io_info)
	free(io_info->reference_parity);
	free(io_info->parity_md_buf);
	free(io_info->reference_md_parity);
	free(io_info->degraded_buf);
	free(io_info->degraded_md_buf);
}

static void
@@ -686,6 +746,7 @@ init_io_info(struct raid_io_info *io_info, struct raid5f_info *r5f_info,
	io_info->io_type = io_type;
	io_info->stripe_index = stripe_index;
	io_info->offset_blocks = stripe_index * r5f_info->stripe_blocks + stripe_offset_blocks;
	io_info->stripe_offset_blocks = stripe_offset_blocks;
	io_info->num_blocks = num_blocks;
	io_info->src_buf = src_buf;
	io_info->dest_buf = dest_buf;
@@ -736,6 +797,45 @@ io_info_setup_parity(struct raid_io_info *io_info, void *src, void *src_md)
	}
}

/*
 * Prepare the reference data for a degraded-read test case.
 *
 * Builds a full stripe image in degraded_buf (filler pattern 0xab plus the
 * expected read data at the test's stripe offset), computes the reference
 * parity from that image, then overwrites the target range with 0xcd so the
 * test can only pass if the driver actually reconstructs the data from the
 * surviving chunks + parity rather than reading it directly.
 */
static void
io_info_setup_degraded(struct raid_io_info *io_info)
{
	struct raid5f_info *r5f_info = io_info->r5f_info;
	struct raid_bdev *raid_bdev = r5f_info->raid_bdev;
	uint32_t blocklen = raid_bdev->bdev.blocklen;
	uint32_t md_len = raid_bdev->bdev.md_len;
	size_t stripe_len = r5f_info->stripe_blocks * blocklen;
	size_t stripe_md_len = r5f_info->stripe_blocks * md_len;

	io_info->degraded_buf = malloc(stripe_len);
	SPDK_CU_ASSERT_FATAL(io_info->degraded_buf != NULL);

	memset(io_info->degraded_buf, 0xab, stripe_len);

	/* Place the expected data at the read's offset within the stripe. */
	memcpy(io_info->degraded_buf + io_info->stripe_offset_blocks * blocklen,
	       io_info->src_buf, io_info->num_blocks * blocklen);

	if (stripe_md_len != 0) {
		io_info->degraded_md_buf = malloc(stripe_md_len);
		SPDK_CU_ASSERT_FATAL(io_info->degraded_md_buf != NULL);

		memset(io_info->degraded_md_buf, 0xab, stripe_md_len);

		memcpy(io_info->degraded_md_buf + io_info->stripe_offset_blocks * md_len,
		       io_info->src_md_buf, io_info->num_blocks * md_len);
	}

	/* Parity is computed over the intact stripe image (including the data). */
	io_info_setup_parity(io_info, io_info->degraded_buf, io_info->degraded_md_buf);

	/* Now "lose" the data: poison the target range so a direct read would fail
	 * the comparison - only XOR reconstruction can produce the right bytes.
	 */
	memset(io_info->degraded_buf + io_info->stripe_offset_blocks * blocklen,
	       0xcd, io_info->num_blocks * blocklen);

	if (stripe_md_len != 0) {
		memset(io_info->degraded_md_buf + io_info->stripe_offset_blocks * md_len,
		       0xcd, io_info->num_blocks * md_len);
	}
}

static void
test_raid5f_submit_rw_request(struct raid5f_info *r5f_info, struct raid_bdev_io_channel *raid_ch,
			      enum spdk_bdev_io_type io_type, uint64_t stripe_index, uint64_t stripe_offset_blocks,
@@ -747,6 +847,9 @@ test_raid5f_submit_rw_request(struct raid5f_info *r5f_info, struct raid_bdev_io_

	switch (io_type) {
	case SPDK_BDEV_IO_TYPE_READ:
		if (g_test_degraded) {
			io_info_setup_degraded(&io_info);
		}
		test_raid5f_read_request(&io_info);
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
@@ -1037,6 +1140,13 @@ test_raid5f_submit_full_stripe_write_request_degraded(void)
	run_for_each_raid5f_config(__test_raid5f_submit_full_stripe_write_request);
}

/* Run the standard read-request test matrix with one base bdev missing,
 * exercising the reconstruct-read path for every raid5f configuration.
 */
static void
test_raid5f_submit_read_request_degraded(void)
{
	g_test_degraded = true;
	run_for_each_raid5f_config(__test_raid5f_submit_read_request);
}

int
main(int argc, char **argv)
{
@@ -1054,6 +1164,7 @@ main(int argc, char **argv)
	CU_ADD_TEST(suite, test_raid5f_chunk_write_error);
	CU_ADD_TEST(suite, test_raid5f_chunk_write_error_with_enomem);
	CU_ADD_TEST(suite, test_raid5f_submit_full_stripe_write_request_degraded);
	CU_ADD_TEST(suite, test_raid5f_submit_read_request_degraded);

	allocate_threads(1);
	set_thread(0);