Commit 94308849 authored by Konrad Sztyber, committed by Tomasz Zawadzki
Browse files

sma: add volume cleanup thread



Since SMA keeps track of attached volumes and a volume might get
disconnected at any time (in which case they're also removed from the
host side), a mechanism is needed to monitor if any of the volumes are
no longer accessible.

This patch implements that mechanism by adding a new thread running in
the background that will periodically (60s by default) send a
bdev_get_bdevs RPC and check that all previously attached volumes are
available.  If any of them are not, they'll be removed and the
associated discovery services stopped (if their refcount goes down to zero).
The period can be changed through the `volume_cleanup_period` variable
in the config file.

One important thing to note is that we assume that any intermittent
connection issues are handled internally by the SPDK application and a
bdev is only removed after all reconnection attempts are performed.

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: I5b9e63698879527d9f79156a0eda1c8bc5e66def
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/12699


Community-CI: Broadcom CI <spdk-ci.pdl@broadcom.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Tomasz Zawadzki <tomasz.zawadzki@intel.com>
parent 3c60910e
Loading
Loading
Loading
Loading
+4 −1
Original line number Diff line number Diff line
@@ -15,7 +15,8 @@ class StorageManagementAgent(pb2_grpc.StorageManagementAgentServicer):
        self._devices = {}
        self._server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
        self._server.add_insecure_port(f'{addr}:{port}')
        self._volume_mgr = VolumeManager(client, config['discovery_timeout'])
        self._volume_mgr = VolumeManager(client, config['discovery_timeout'],
                                         config['volume_cleanup_period'])
        pb2_grpc.add_StorageManagementAgentServicer_to_server(self, self._server)

    def _grpc_method(f):
@@ -28,10 +29,12 @@ class StorageManagementAgent(pb2_grpc.StorageManagementAgentServicer):
        self._devices[device_manager.protocol] = device_manager

    def start(self):
        self._volume_mgr.start()
        self._server.start()

    def stop(self):
        self._server.stop(None)
        self._volume_mgr.stop()

    def _find_device_by_name(self, name):
        return self._devices.get(name)
+66 −4
Original line number Diff line number Diff line
import functools
import ipaddress
import logging
import threading
import uuid
from dataclasses import dataclass

import grpc

from spdk.rpc.client import JSONRPCException
@@ -25,13 +26,67 @@ class Volume:


class VolumeManager:
    def __init__(self, client, discovery_timeout):
    def __init__(self, client, discovery_timeout, cleanup_period):
        """Create a volume manager.

        :param client: factory producing a JSON-RPC client usable as a
            context manager (see _cleanup_volumes)
        :param discovery_timeout: discovery timeout in seconds (stored
            internally in milliseconds)
        :param cleanup_period: seconds between volume cleanup passes
        """
        self._client = client
        # Discovery service map (name -> refcnt)
        self._discovery = {}
        # Volume map (volume_id -> Volume)
        self._volumes = {}
        # Converted to milliseconds here; callers pass seconds
        self._discovery_timeout = int(discovery_timeout * 1000)
        self._cleanup_period = cleanup_period
        self._lock = threading.Lock()
        # Condition shares _lock; stop() uses it to wake the cleanup
        # thread early instead of waiting out a full cleanup period
        self._cv = threading.Condition(self._lock)
        self._running = False
        # Cleanup thread handle; None until start() is called
        self._thread = None

    def _locked(f):
        def wrapper(self, *args, **kwargs):
            self._lock.acquire()
            try:
                return f(self, *args, **kwargs)
            finally:
                self._lock.release()
        return wrapper

    def start(self):
        if self._thread is not None:
            raise ValueError('Volume manager was already started')
        self._running = True
        self._thread = threading.Thread(target=self._cleanup_thread, args=(self,))
        self._thread.start()

    def stop(self):
        if self._thread is None:
            return
        with self._lock:
            self._running = False
            self._cv.notify_all()
        self._thread.join()
        self._thread = None

    @staticmethod
    def _cleanup_thread(*args):
        self, = args
        with self._lock:
            while self._running:
                self._cleanup_volumes()
                self._cv.wait(self._cleanup_period)

    def _cleanup_volumes(self):
        try:
            disconnected = []
            with self._client() as client:
                bdevs = client.call('bdev_get_bdevs')
                for volume_id in self._volumes:
                    if volume_id not in [b['uuid'] for b in bdevs]:
                        log.warning(f'Found disconnected volume: {volume_id}')
                        disconnected.append(volume_id)
            for volume_id in disconnected:
                self._disconnect_volume(volume_id)
        except VolumeException as ex:
            log.error(f'Failure when trying to disconnect volumes: {ex.message}')
        except JSONRPCException as ex:
            log.error(f'Failed to retrieve bdevs: {ex.message}')

    def _get_discovery_info(self):
        try:
@@ -102,6 +157,7 @@ class VolumeManager:
            raise VolumeException(grpc.StatusCode.INTERNAL,
                                  'Failed to stop discovery')

    @_locked
    def connect_volume(self, params, device_handle=None):
        """ Connects a volume through a discovery service.  Returns a tuple (volume_id, existing):
        the first item is a volume_id as str, while the second denotes whether the selected volume
@@ -173,8 +229,7 @@ class VolumeManager:
            raise ex
        return volume_id, False

    def disconnect_volume(self, volume_id):
        """Disconnects a volume connected through discovery service"""
    def _disconnect_volume(self, volume_id):
        id = format_volume_id(volume_id)
        if id is None:
            raise VolumeException(grpc.StatusCode.INVALID_ARGUMENT,
@@ -193,6 +248,12 @@ class VolumeManager:
                log.error(f'Failed to stop discovery service: {name}')
        del self._volumes[id]

    @_locked
    def disconnect_volume(self, volume_id):
        """Disconnects a volume connected through a discovery service.

        Thread-safe public entry point: takes the manager lock and delegates
        to _disconnect_volume().
        """
        return self._disconnect_volume(volume_id)

    @_locked
    def set_device(self, volume_id, device_handle):
        """Marks a previously connected volume as being attached to specified device.  This is only
        necessary if the device handle is not known at a time a volume is connected.
@@ -210,8 +271,9 @@ class VolumeManager:
                                  'Volume is already attached to a different device')
        volume.device_handle = device_handle

    @_locked
    def disconnect_device_volumes(self, device_handle):
        """Disconnects all volumes attached to a specific device.

        Must call the unlocked _disconnect_volume() helper: the manager lock
        is already held via @_locked, so going through the public
        disconnect_volume() would deadlock on the non-reentrant lock.
        """
        volumes = [i for i, v in self._volumes.items() if v.device_handle == device_handle]
        for volume_id in volumes:
            self._disconnect_volume(volume_id)
+2 −1
Original line number Diff line number Diff line
@@ -33,7 +33,8 @@ def parse_argv():
    defaults = {'address': 'localhost',
                'socket': '/var/tmp/spdk.sock',
                'port': 8080,
                'discovery_timeout': 10.0}
                'discovery_timeout': 10.0,
                'volume_cleanup_period': 60.0}
    # Merge the default values, config file, and the command-line
    args = vars(parser.parse_args())
    config = parse_config(args.get('config'))
+31 −0
Original line number Diff line number Diff line
@@ -18,6 +18,7 @@ t2dscport2=8011
t1nqn='nqn.2016-06.io.spdk:node1'
t2nqn='nqn.2016-06.io.spdk:node2'
hostnqn='nqn.2016-06.io.spdk:host0'
cleanup_period=1

function cleanup() {
	killprocess $smapid
@@ -141,6 +142,7 @@ tgtpid=$!
$rootdir/scripts/sma.py -c <(
	cat <<- EOF
		discovery_timeout: 5
		volume_cleanup_period: $cleanup_period
		devices:
		  - name: 'nvmf_tcp'
	EOF
@@ -398,6 +400,35 @@ NOT attach_volume $device_id $(uuidgen) $invalid_port
[[ $($rpc_py bdev_nvme_get_discovery_info | jq -r '. | length') -eq 1 ]]
$rpc_py bdev_nvme_get_discovery_info | jq -r '.[].trid.trsvcid' | grep $t1dscport

# Make sure that the discovery service is stopped if a volume is disconnected outside of SMA (e.g.
# by removing it from the target)
$rpc_py -s $t1sock nvmf_subsystem_remove_ns $t1nqn 1
# Give SMA some time to be notified about the change
sleep $((cleanup_period + 1))
[[ $($rpc_py bdev_nvme_get_discovery_info | jq -r '. | length') -eq 0 ]]
$rpc_py -s $t1sock nvmf_subsystem_add_ns $t1nqn $t1uuid

# Do the same, but this time attach two volumes and check that the discovery service is only
# stopped once both volumes are disconnected
attach_volume $device_id $t2uuid $t2dscport1
attach_volume $device_id $t2uuid2 $t2dscport1
[[ $($rpc_py nvmf_get_subsystems $localnqn | jq -r '.[].namespaces | length') -eq 2 ]]
[[ $($rpc_py bdev_nvme_get_discovery_info | jq -r '. | length') -eq 1 ]]
$rpc_py -s $t2sock nvmf_subsystem_remove_ns $t2nqn 2
# Give SMA some time to be notified about the change
sleep $((cleanup_period + 1))
# One of the volumes should be gone, but the discovery service should still be running
[[ $($rpc_py nvmf_get_subsystems $localnqn | jq -r '.[].namespaces | length') -eq 1 ]]
[[ $($rpc_py bdev_nvme_get_discovery_info | jq -r '. | length') -eq 1 ]]
$rpc_py -s $t2sock nvmf_subsystem_remove_ns $t2nqn 1
# Give SMA some time to be notified about the change
sleep $((cleanup_period + 1))
# Now that both are gone, the discovery service should be stopped too
[[ $($rpc_py nvmf_get_subsystems $localnqn | jq -r '.[].namespaces | length') -eq 0 ]]
[[ $($rpc_py bdev_nvme_get_discovery_info | jq -r '. | length') -eq 0 ]]
$rpc_py -s $t2sock nvmf_subsystem_add_ns $t2nqn $t2uuid
$rpc_py -s $t2sock nvmf_subsystem_add_ns $t2nqn $t2uuid2

delete_device $device_id

cleanup