Commit 94308849 authored by Konrad Sztyber, committed by Tomasz Zawadzki
Browse files

sma: add volume cleanup thread



Since SMA keeps track of attached volumes and a volume might get
disconnected at any time (in which case they're also removed from the
host side), a mechanism is needed to monitor if any of the volumes are
no longer accessible.

This patch implements that mechanism by adding a new thread running in
the background that will periodically (60s by default) send a
bdev_get_bdevs RPC and check that all previously attached volumes are
available.  If any of them are not, they'll be removed and the
associated discovery services stopped (if their refcount goes down to zero).
The period can be changed through the `volume_cleanup_period` variable
in the config file.

One important thing to note is that we assume that any intermittent
connection issues are handled internally by the SPDK application and a
bdev is only removed after all reconnection attempts are performed.

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: I5b9e63698879527d9f79156a0eda1c8bc5e66def
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/12699


Community-CI: Broadcom CI <spdk-ci.pdl@broadcom.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Tomasz Zawadzki <tomasz.zawadzki@intel.com>
parent 3c60910e
Loading
Loading
Loading
Loading
+4 −1
Original line number Diff line number Diff line
@@ -15,7 +15,8 @@ class StorageManagementAgent(pb2_grpc.StorageManagementAgentServicer):
        self._devices = {}
        self._server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
        self._server.add_insecure_port(f'{addr}:{port}')
        self._volume_mgr = VolumeManager(client, config['discovery_timeout'])
        self._volume_mgr = VolumeManager(client, config['discovery_timeout'],
                                         config['volume_cleanup_period'])
        pb2_grpc.add_StorageManagementAgentServicer_to_server(self, self._server)

    def _grpc_method(f):
@@ -28,10 +29,12 @@ class StorageManagementAgent(pb2_grpc.StorageManagementAgentServicer):
        self._devices[device_manager.protocol] = device_manager

    def start(self):
        self._volume_mgr.start()
        self._server.start()

    def stop(self):
        self._server.stop(None)
        self._volume_mgr.stop()

    def _find_device_by_name(self, name):
        return self._devices.get(name)
+66 −4
Original line number Diff line number Diff line
import functools
import ipaddress
import logging
import threading
import uuid
from dataclasses import dataclass

import grpc

from spdk.rpc.client import JSONRPCException
@@ -25,13 +26,67 @@ class Volume:


class VolumeManager:
    def __init__(self, client, discovery_timeout):
    def __init__(self, client, discovery_timeout, cleanup_period):
        """Create a volume manager.

        :param client: factory producing a JSON-RPC client usable as a
            context manager (see _cleanup_volumes)
        :param discovery_timeout: discovery timeout in seconds (stored
            internally in milliseconds)
        :param cleanup_period: seconds between volume cleanup passes
        """
        self._client = client
        # Discovery service map (name -> refcnt)
        self._discovery = {}
        # Volume map (volume_id -> Volume)
        self._volumes = {}
        # Converted to milliseconds here; callers pass seconds
        self._discovery_timeout = int(discovery_timeout * 1000)
        self._cleanup_period = cleanup_period
        self._lock = threading.Lock()
        # Condition shares _lock; stop() uses it to wake the cleanup
        # thread early instead of waiting out a full cleanup period
        self._cv = threading.Condition(self._lock)
        self._running = False
        # Cleanup thread handle; None until start() is called
        self._thread = None

    def _locked(f):
        def wrapper(self, *args, **kwargs):
            self._lock.acquire()
            try:
                return f(self, *args, **kwargs)
            finally:
                self._lock.release()
        return wrapper

    def start(self):
        if self._thread is not None:
            raise ValueError('Volume manager was already started')
        self._running = True
        self._thread = threading.Thread(target=self._cleanup_thread, args=(self,))
        self._thread.start()

    def stop(self):
        if self._thread is None:
            return
        with self._lock:
            self._running = False
            self._cv.notify_all()
        self._thread.join()
        self._thread = None

    @staticmethod
    def _cleanup_thread(*args):
        self, = args
        with self._lock:
            while self._running:
                self._cleanup_volumes()
                self._cv.wait(self._cleanup_period)

    def _cleanup_volumes(self):
        try:
            disconnected = []
            with self._client() as client:
                bdevs = client.call('bdev_get_bdevs')
                for volume_id in self._volumes:
                    if volume_id not in [b['uuid'] for b in bdevs]:
                        log.warning(f'Found disconnected volume: {volume_id}')
                        disconnected.append(volume_id)
            for volume_id in disconnected:
                self._disconnect_volume(volume_id)
        except VolumeException as ex:
            log.error(f'Failure when trying to disconnect volumes: {ex.message}')
        except JSONRPCException as ex:
            log.error(f'Failed to retrieve bdevs: {ex.message}')

    def _get_discovery_info(self):
        try:
@@ -102,6 +157,7 @@ class VolumeManager:
            raise VolumeException(grpc.StatusCode.INTERNAL,
                                  'Failed to stop discovery')

    @_locked
    def connect_volume(self, params, device_handle=None):
        """ Connects a volume through a discovery service.  Returns a tuple (volume_id, existing):
        the first item is a volume_id as str, while the second denotes whether the selected volume
@@ -173,8 +229,7 @@ class VolumeManager:
            raise ex
        return volume_id, False

    def disconnect_volume(self, volume_id):
        """Disconnects a volume connected through discovery service"""
    def _disconnect_volume(self, volume_id):
        id = format_volume_id(volume_id)
        if id is None:
            raise VolumeException(grpc.StatusCode.INVALID_ARGUMENT,
@@ -193,6 +248,12 @@ class VolumeManager:
                log.error(f'Failed to stop discovery service: {name}')
        del self._volumes[id]

    @_locked
    def disconnect_volume(self, volume_id):
        """Disconnects a volume connected through a discovery service.

        Thread-safe public entry point: takes the manager lock and delegates
        to _disconnect_volume().
        """
        return self._disconnect_volume(volume_id)

    @_locked
    def set_device(self, volume_id, device_handle):
        """Marks a previously connected volume as being attached to specified device.  This is only
        necessary if the device handle is not known at a time a volume is connected.
@@ -210,8 +271,9 @@ class VolumeManager:
                                  'Volume is already attached to a different device')
        volume.device_handle = device_handle

    @_locked
    def disconnect_device_volumes(self, device_handle):
        """Disconnects all volumes attached to a specific device.

        Must call the unlocked _disconnect_volume() helper: the manager lock
        is already held via @_locked, so going through the public
        disconnect_volume() would deadlock on the non-reentrant lock.
        """
        volumes = [i for i, v in self._volumes.items() if v.device_handle == device_handle]
        for volume_id in volumes:
            self._disconnect_volume(volume_id)
+2 −1
Original line number Diff line number Diff line
@@ -33,7 +33,8 @@ def parse_argv():
    defaults = {'address': 'localhost',
                'socket': '/var/tmp/spdk.sock',
                'port': 8080,
                'discovery_timeout': 10.0}
                'discovery_timeout': 10.0,
                'volume_cleanup_period': 60.0}
    # Merge the default values, config file, and the command-line
    args = vars(parser.parse_args())
    config = parse_config(args.get('config'))
+31 −0
Original line number Diff line number Diff line
@@ -18,6 +18,7 @@ t2dscport2=8011
t1nqn='nqn.2016-06.io.spdk:node1'
t2nqn='nqn.2016-06.io.spdk:node2'
hostnqn='nqn.2016-06.io.spdk:host0'
cleanup_period=1

function cleanup() {
	killprocess $smapid
@@ -141,6 +142,7 @@ tgtpid=$!
$rootdir/scripts/sma.py -c <(
	cat <<- EOF
		discovery_timeout: 5
		volume_cleanup_period: $cleanup_period
		devices:
		  - name: 'nvmf_tcp'
	EOF
@@ -398,6 +400,35 @@ NOT attach_volume $device_id $(uuidgen) $invalid_port
[[ $($rpc_py bdev_nvme_get_discovery_info | jq -r '. | length') -eq 1 ]]
$rpc_py bdev_nvme_get_discovery_info | jq -r '.[].trid.trsvcid' | grep $t1dscport

# Make sure that the discovery service is stopped if a volume is disconnected outside of SMA (e.g.
# by removing it from the target)
$rpc_py -s $t1sock nvmf_subsystem_remove_ns $t1nqn 1
# Give SMA some time to be notified about the change
sleep $((cleanup_period + 1))
[[ $($rpc_py bdev_nvme_get_discovery_info | jq -r '. | length') -eq 0 ]]
$rpc_py -s $t1sock nvmf_subsystem_add_ns $t1nqn $t1uuid

# Do the same, but this time attach two volumes and check that the discovery service is only
# stopped once both volumes are disconnected
attach_volume $device_id $t2uuid $t2dscport1
attach_volume $device_id $t2uuid2 $t2dscport1
[[ $($rpc_py nvmf_get_subsystems $localnqn | jq -r '.[].namespaces | length') -eq 2 ]]
[[ $($rpc_py bdev_nvme_get_discovery_info | jq -r '. | length') -eq 1 ]]
$rpc_py -s $t2sock nvmf_subsystem_remove_ns $t2nqn 2
# Give SMA some time to be notified about the change
sleep $((cleanup_period + 1))
# One of the volumes should be gone, but the discovery service should still be running
[[ $($rpc_py nvmf_get_subsystems $localnqn | jq -r '.[].namespaces | length') -eq 1 ]]
[[ $($rpc_py bdev_nvme_get_discovery_info | jq -r '. | length') -eq 1 ]]
$rpc_py -s $t2sock nvmf_subsystem_remove_ns $t2nqn 1
# Give SMA some time to be notified about the change
sleep $((cleanup_period + 1))
# Now that both are gone, the discovery service should be stopped too
[[ $($rpc_py nvmf_get_subsystems $localnqn | jq -r '.[].namespaces | length') -eq 0 ]]
[[ $($rpc_py bdev_nvme_get_discovery_info | jq -r '. | length') -eq 0 ]]
$rpc_py -s $t2sock nvmf_subsystem_add_ns $t2nqn $t2uuid
$rpc_py -s $t2sock nvmf_subsystem_add_ns $t2nqn $t2uuid2

delete_device $device_id

cleanup