Commit 255d0786 authored by Artur Paszkiewicz, committed by Tomasz Zawadzki
Browse files

module/raid: allow assembly of a degraded raid



Add num_base_bdevs_operational to raid_bdev and use it to determine the
required number of base bdevs.

Change-Id: I31b39cc8ea708b6cdce748f015949e4c9fdeb3cd
Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16167


Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
Reviewed-by: Jim Harris <jim.harris@samsung.com>
Community-CI: Mellanox Build Bot
parent fd92e702
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -10453,6 +10453,7 @@ Example response:
      "raid_level": "raid0",
      "num_base_bdevs": 2,
      "num_base_bdevs_discovered": 2,
      "num_base_bdevs_operational": 2,
      "base_bdevs_list": [
        {
          "name": "malloc0",
@@ -10478,6 +10479,7 @@ Example response:
      "raid_level": "raid0",
      "num_base_bdevs": 2,
      "num_base_bdevs_discovered": 1,
      "num_base_bdevs_operational": 2,
      "base_bdevs_list": [
        {
          "name": "malloc2",
+28 −6
Original line number Diff line number Diff line
@@ -617,6 +617,8 @@ raid_bdev_write_info_json(struct raid_bdev *raid_bdev, struct spdk_json_write_ct
	spdk_json_write_named_bool(w, "superblock", raid_bdev->sb != NULL);
	spdk_json_write_named_uint32(w, "num_base_bdevs", raid_bdev->num_base_bdevs);
	spdk_json_write_named_uint32(w, "num_base_bdevs_discovered", raid_bdev->num_base_bdevs_discovered);
	spdk_json_write_named_uint32(w, "num_base_bdevs_operational",
				     raid_bdev->num_base_bdevs_operational);
	spdk_json_write_name(w, "base_bdevs_list");
	spdk_json_write_array_begin(w);
	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
@@ -1123,6 +1125,8 @@ raid_bdev_create(const char *name, uint32_t strip_size, uint8_t num_base_bdevs,
		spdk_uuid_generate(&raid_bdev->bdev.uuid);
	}

	raid_bdev->num_base_bdevs_operational = num_base_bdevs;

	*raid_bdev_out = raid_bdev;

	return 0;
@@ -1145,6 +1149,9 @@ raid_bdev_configure_md(struct raid_bdev *raid_bdev)
	uint8_t i;

	for (i = 0; i < raid_bdev->num_base_bdevs; i++) {
		if (raid_bdev->base_bdev_info[i].desc == NULL) {
			continue;
		}
		base_bdev = spdk_bdev_desc_get_bdev(raid_bdev->base_bdev_info[i].desc);

		/* Currently, RAID bdevs do not support DIF or DIX, so a RAID bdev cannot
@@ -1233,11 +1240,14 @@ raid_bdev_configure(struct raid_bdev *raid_bdev)
	int rc = 0;

	assert(raid_bdev->state == RAID_BDEV_STATE_CONFIGURING);
	assert(raid_bdev->num_base_bdevs_discovered == raid_bdev->num_base_bdevs);
	assert(raid_bdev->num_base_bdevs_discovered == raid_bdev->num_base_bdevs_operational);

	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
		assert(base_info->desc != NULL);
		if (base_info->desc == NULL) {
			continue;
		}
		base_bdev = spdk_bdev_desc_get_bdev(base_info->desc);

		/* Check blocklen for all base bdevs that it should be same */
		if (blocklen == 0) {
			blocklen = base_bdev->blocklen;
@@ -1519,13 +1529,16 @@ raid_bdev_remove_base_bdev(struct spdk_bdev *base_bdev, raid_bdev_remove_base_bd
		/*
		 * As raid bdev is not registered yet or already unregistered,
		 * so cleanup should be done here itself.
		 *
		 * Removing a base bdev at this stage does not change the number of operational
		 * base bdevs, only the number of discovered base bdevs.
		 */
		raid_bdev_free_base_bdev_resource(base_info);
		if (raid_bdev->num_base_bdevs_discovered == 0) {
			/* There is no base bdev for this raid, so free the raid device. */
			raid_bdev_cleanup_and_free(raid_bdev);
		}
	} else if (raid_bdev->num_base_bdevs_discovered == raid_bdev->min_base_bdevs_operational) {
	} else if (raid_bdev->num_base_bdevs_operational-- == raid_bdev->min_base_bdevs_operational) {
		/*
		 * After this base bdev is removed there will not be enough base bdevs
		 * to keep the raid bdev operational.
@@ -1674,8 +1687,16 @@ raid_bdev_configure_base_bdev_cont(struct raid_base_bdev_info *base_info)

	raid_bdev->num_base_bdevs_discovered++;
	assert(raid_bdev->num_base_bdevs_discovered <= raid_bdev->num_base_bdevs);
	assert(raid_bdev->num_base_bdevs_operational <= raid_bdev->num_base_bdevs);
	assert(raid_bdev->num_base_bdevs_operational >= raid_bdev->min_base_bdevs_operational);

	if (raid_bdev->num_base_bdevs_discovered == raid_bdev->num_base_bdevs) {
	/*
	 * Configure the raid bdev when the number of discovered base bdevs reaches the number
	 * of base bdevs we know to be operational members of the array. Usually this is equal
	 * to the total number of base bdevs (num_base_bdevs) but can be less - when the array is
	 * degraded.
	 */
	if (raid_bdev->num_base_bdevs_discovered == raid_bdev->num_base_bdevs_operational) {
		rc = raid_bdev_configure(raid_bdev);
		if (rc != 0) {
			SPDK_ERRLOG("Failed to configure raid bdev: %s\n", spdk_strerror(-rc));
@@ -1913,7 +1934,7 @@ raid_bdev_create_from_sb(const struct raid_bdev_superblock *sb, struct raid_bdev
	uint8_t i;
	int rc;

	rc = raid_bdev_create(sb->name, (sb->strip_size * sb->block_size) / 1024, sb->num_base_bdevs,
	rc = _raid_bdev_create(sb->name, (sb->strip_size * sb->block_size) / 1024, sb->num_base_bdevs,
			       sb->level, true, &sb->uuid, &raid_bdev);
	if (rc != 0) {
		return rc;
@@ -1928,6 +1949,7 @@ raid_bdev_create_from_sb(const struct raid_bdev_superblock *sb, struct raid_bdev

		if (sb_base_bdev->state == RAID_SB_BASE_BDEV_CONFIGURED) {
			spdk_uuid_copy(&base_info->uuid, &sb_base_bdev->uuid);
			raid_bdev->num_base_bdevs_operational++;
		}

		base_info->data_offset = sb_base_bdev->data_offset;
+10 −1
Original line number Diff line number Diff line
@@ -60,7 +60,10 @@ struct raid_base_bdev_info {
	/* uuid of the bdev */
	struct spdk_uuid	uuid;

	/* pointer to base bdev descriptor opened by raid bdev */
	/*
	 * Pointer to base bdev descriptor opened by raid bdev. This is NULL when the bdev for
	 * this slot is missing.
	 */
	struct spdk_bdev_desc	*desc;

	/* offset in blocks from the start of the base bdev to the start of the data region */
@@ -154,6 +157,12 @@ struct raid_bdev {
	/* number of base bdevs discovered */
	uint8_t				num_base_bdevs_discovered;

	/*
	 * Number of operational base bdevs, i.e. how many we know/expect to be working. This
	 * will be less than num_base_bdevs when starting a degraded array.
	 */
	uint8_t				num_base_bdevs_operational;

	/* minimum number of viable base bdevs that are required by array to operate */
	uint8_t				min_base_bdevs_operational;

+5 −3
Original line number Diff line number Diff line
@@ -1066,6 +1066,7 @@ raid5f_start(struct raid_bdev *raid_bdev)
	uint64_t min_blockcnt = UINT64_MAX;
	uint64_t base_bdev_data_size;
	struct raid_base_bdev_info *base_info;
	struct spdk_bdev *base_bdev;
	struct raid5f_info *r5f_info;
	size_t alignment = 0;

@@ -1077,11 +1078,12 @@ raid5f_start(struct raid_bdev *raid_bdev)
	r5f_info->raid_bdev = raid_bdev;

	RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
		struct spdk_bdev *base_bdev = spdk_bdev_desc_get_bdev(base_info->desc);

		min_blockcnt = spdk_min(min_blockcnt, base_info->data_size);
		if (base_info->desc) {
			base_bdev = spdk_bdev_desc_get_bdev(base_info->desc);
			alignment = spdk_max(alignment, spdk_bdev_get_buf_align(base_bdev));
		}
	}

	base_bdev_data_size = (min_blockcnt / raid_bdev->strip_size) * raid_bdev->strip_size;

+102 −26
Original line number Diff line number Diff line
@@ -118,21 +118,22 @@ function raid_function_test() {
	return 0
}

function verify_raid_bdev_state() (
	set +x
function verify_raid_bdev_state() {
	local raid_bdev_name=$1
	local expected_state=$2
	local raid_level=$3
	local strip_size=$4
	local raid_bdev
	local num_base_bdevs_operational=$5
	local raid_bdev_info
	local num_base_bdevs
	local num_base_bdevs_discovered
	local tmp

	raid_bdev=$($rpc_py bdev_raid_get_bdevs all | jq -r '.[0] | select(.)')
	if [ -z "$raid_bdev" ]; then
		echo "No raid device in SPDK app"
	raid_bdev_info=$($rpc_py bdev_raid_get_bdevs all | jq -r ".[] | select(.name == \"$raid_bdev_name\")")

	xtrace_disable
	if [ -z "$raid_bdev_info" ]; then
		echo "No raid device \"$raid_bdev_name\" in SPDK app"
		return 1
	fi

@@ -173,7 +174,15 @@ function verify_raid_bdev_state() (
		echo "incorrect num_base_bdevs_discovered: $tmp, expected: $num_base_bdevs_discovered"
		return 1
	fi
)

	tmp=$(echo $raid_bdev_info | jq -r '.num_base_bdevs_operational')
	if [ "$num_base_bdevs_operational" != "$tmp" ]; then
		echo "incorrect num_base_bdevs_operational $tmp, expected: $num_base_bdevs_operational"
		return 1
	fi

	xtrace_restore
}

function has_redundancy() {
	case $1 in
@@ -214,9 +223,7 @@ function raid_state_function_test() {
	# Step1: create a RAID bdev with no base bdevs
	# Expect state: CONFIGURING
	$rpc_py bdev_raid_create $strip_size_create_arg $superblock_create_arg -r $raid_level -b "${base_bdevs[*]}" -n $raid_bdev_name
	if ! verify_raid_bdev_state $raid_bdev_name "configuring" $raid_level $strip_size; then
		return 1
	fi
	verify_raid_bdev_state $raid_bdev_name "configuring" $raid_level $strip_size $num_base_bdevs
	$rpc_py bdev_raid_delete $raid_bdev_name

	# Step2: create one base bdev and add to the RAID bdev
@@ -224,9 +231,7 @@ function raid_state_function_test() {
	$rpc_py bdev_raid_create $strip_size_create_arg $superblock_create_arg -r $raid_level -b "${base_bdevs[*]}" -n $raid_bdev_name
	$rpc_py bdev_malloc_create 32 512 -b ${base_bdevs[0]}
	waitforbdev ${base_bdevs[0]}
	if ! verify_raid_bdev_state $raid_bdev_name "configuring" $raid_level $strip_size; then
		return 1
	fi
	verify_raid_bdev_state $raid_bdev_name "configuring" $raid_level $strip_size $num_base_bdevs
	$rpc_py bdev_raid_delete $raid_bdev_name

	if [ $superblock = true ]; then
@@ -240,15 +245,11 @@ function raid_state_function_test() {
	# Expect state: ONLINE
	$rpc_py bdev_raid_create $strip_size_create_arg $superblock_create_arg -r $raid_level -b "${base_bdevs[*]}" -n $raid_bdev_name
	for ((i = 1; i < num_base_bdevs; i++)); do
		if ! verify_raid_bdev_state $raid_bdev_name "configuring" $raid_level $strip_size; then
			return 1
		fi
		verify_raid_bdev_state $raid_bdev_name "configuring" $raid_level $strip_size $num_base_bdevs
		$rpc_py bdev_malloc_create 32 512 -b ${base_bdevs[$i]}
		waitforbdev ${base_bdevs[$i]}
	done
	if ! verify_raid_bdev_state $raid_bdev_name "online" $raid_level $strip_size; then
		return 1
	fi
	verify_raid_bdev_state $raid_bdev_name "online" $raid_level $strip_size $num_base_bdevs

	# Step4: delete one base bdev from the RAID bdev
	$rpc_py bdev_malloc_delete ${base_bdevs[0]}
@@ -258,9 +259,7 @@ function raid_state_function_test() {
	else
		expected_state="online"
	fi
	if ! verify_raid_bdev_state $raid_bdev_name $expected_state $raid_level $strip_size; then
		return 1
	fi
	verify_raid_bdev_state $raid_bdev_name $expected_state $raid_level $strip_size $((num_base_bdevs - 1))

	# Step5: delete remaining base bdevs from the RAID bdev
	# Expect state: removed from system
@@ -367,7 +366,7 @@ function raid_superblock_test() {

	# Create RAID bdev with superblock
	$rpc_py bdev_raid_create $strip_size_create_arg -r $raid_level -b "${base_bdevs_pt[*]}" -n $raid_bdev_name -s
	verify_raid_bdev_state $raid_bdev_name "online" $raid_level $strip_size
	verify_raid_bdev_state $raid_bdev_name "online" $raid_level $strip_size $num_base_bdevs

	# Get RAID bdev's UUID
	raid_bdev_uuid=$($rpc_py bdev_get_bdevs -b $raid_bdev_name | jq -r '.[] | .uuid')
@@ -393,7 +392,7 @@ function raid_superblock_test() {
	# Try to create new RAID bdev from malloc bdevs
	# Should not reach online state due to superblock still present on base bdevs
	$rpc_py bdev_raid_create $strip_size_create_arg -r $raid_level -b "${base_bdevs_malloc[*]}" -n $raid_bdev_name
	verify_raid_bdev_state $raid_bdev_name "configuring" $raid_level $strip_size
	verify_raid_bdev_state $raid_bdev_name "configuring" $raid_level $strip_size $num_base_bdevs

	# Stop the RAID bdev
	$rpc_py bdev_raid_delete $raid_bdev_name
@@ -406,7 +405,7 @@ function raid_superblock_test() {
	$rpc_py bdev_passthru_create -b ${base_bdevs_malloc[0]} -p ${base_bdevs_pt[0]} -u ${base_bdevs_pt_uuid[0]}

	# Check if the RAID bdev was assembled from superblock
	verify_raid_bdev_state $raid_bdev_name "configuring" $raid_level $strip_size
	verify_raid_bdev_state $raid_bdev_name "configuring" $raid_level $strip_size $num_base_bdevs

	if [ $num_base_bdevs -gt 2 ]; then
		# Re-add the second base bdev and remove it again
@@ -421,13 +420,90 @@ function raid_superblock_test() {
	done

	# Check if the RAID bdev is in online state
	verify_raid_bdev_state $raid_bdev_name "online" $raid_level $strip_size
	verify_raid_bdev_state $raid_bdev_name "online" $raid_level $strip_size $num_base_bdevs

	# Check if the RAID bdev has the same UUID as when first created
	if [ "$($rpc_py bdev_get_bdevs -b $raid_bdev_name | jq -r '.[] | .uuid')" != "$raid_bdev_uuid" ]; then
		return 1
	fi

	if has_redundancy $raid_level; then
		# Delete one base bdev
		$rpc_py bdev_passthru_delete ${base_bdevs_pt[0]}

		# Check if the RAID bdev is in online state (degraded)
		verify_raid_bdev_state $raid_bdev_name "online" $raid_level $strip_size $((num_base_bdevs - 1))

		# Stop the RAID bdev
		$rpc_py bdev_raid_delete $raid_bdev_name
		raid_bdev=$($rpc_py bdev_raid_get_bdevs all | jq -r '.[]')
		if [ -n "$raid_bdev" ]; then
			return 1
		fi

		# Delete remaining base bdevs
		for ((i = 1; i < num_base_bdevs; i++)); do
			$rpc_py bdev_passthru_delete ${base_bdevs_pt[$i]}
		done

		# Re-add base bdevs from the second up to (not including) the last one
		for ((i = 1; i < num_base_bdevs - 1; i++)); do
			$rpc_py bdev_passthru_create -b ${base_bdevs_malloc[$i]} -p ${base_bdevs_pt[$i]} -u ${base_bdevs_pt_uuid[$i]}

			# Check if the RAID bdev is in configuring state
			verify_raid_bdev_state $raid_bdev_name "configuring" $raid_level $strip_size $((num_base_bdevs - 1))
		done

		# Re-add the last base bdev
		i=$((num_base_bdevs - 1))
		$rpc_py bdev_passthru_create -b ${base_bdevs_malloc[$i]} -p ${base_bdevs_pt[$i]} -u ${base_bdevs_pt_uuid[$i]}

		# Check if the RAID bdev is in online state (degraded)
		verify_raid_bdev_state $raid_bdev_name "online" $raid_level $strip_size $((num_base_bdevs - 1))

		if [ $num_base_bdevs -gt 2 ]; then
			# Stop the RAID bdev
			$rpc_py bdev_raid_delete $raid_bdev_name
			raid_bdev=$($rpc_py bdev_raid_get_bdevs all | jq -r '.[]')
			if [ -n "$raid_bdev" ]; then
				return 1
			fi

			# Re-add first base bdev
			# This is the "failed" device and contains the "old" version of the superblock
			$rpc_py bdev_passthru_create -b ${base_bdevs_malloc[0]} -p ${base_bdevs_pt[0]} -u ${base_bdevs_pt_uuid[0]}

			# Check if the RAID bdev is in configuring state
			verify_raid_bdev_state $raid_bdev_name "configuring" $raid_level $strip_size $num_base_bdevs

			# Delete remaining base bdevs
			for ((i = 1; i < num_base_bdevs; i++)); do
				$rpc_py bdev_passthru_delete ${base_bdevs_pt[$i]}
			done

			# Re-add the last base bdev
			i=$((num_base_bdevs - 1))
			$rpc_py bdev_passthru_create -b ${base_bdevs_malloc[$i]} -p ${base_bdevs_pt[$i]} -u ${base_bdevs_pt_uuid[$i]}

			# Check if the RAID bdev is in configuring state
			# This should use the newer superblock version and have n-1 online base bdevs
			verify_raid_bdev_state $raid_bdev_name "configuring" $raid_level $strip_size $((num_base_bdevs - 1))

			# Re-add remaining base bdevs
			for ((i = 1; i < num_base_bdevs - 1; i++)); do
				$rpc_py bdev_passthru_create -b ${base_bdevs_malloc[$i]} -p ${base_bdevs_pt[$i]} -u ${base_bdevs_pt_uuid[$i]}
			done

			# Check if the RAID bdev is in online state (degraded)
			verify_raid_bdev_state $raid_bdev_name "online" $raid_level $strip_size $((num_base_bdevs - 1))
		fi

		# Check if the RAID bdev has the same UUID as when first created
		if [ "$($rpc_py bdev_get_bdevs -b $raid_bdev_name | jq -r '.[] | .uuid')" != "$raid_bdev_uuid" ]; then
			return 1
		fi
	fi

	killprocess $raid_pid

	return 0