Commit f08a6eeb authored by Darek Stojaczyk's avatar Darek Stojaczyk Committed by Jim Harris
Browse files

vhost: don't lock global vhost mutex when waiting for device start/stop



This fixes a potential deadlock:

Thread 2
 * stop_device()
   * lock(&g_spdk_vhost_mutex)
   * _spdk_vhost_event_send
     * sem_wait <- waiting for pending I/O on Thread 1 to complete

Thread 1
 * spdk_rpc_construct_vhost_blk_controller
   * lock(&g_spdk_vhost_mutex) <- prevents this thread from
                                  completing any I/O

Fixes #437

Change-Id: I50ab7bc6dcd161881650ff30362127e0069a3939
Signed-off-by: default avatarDarek Stojaczyk <dariusz.stojaczyk@intel.com>
Reviewed-on: https://review.gerrithub.io/396577


Tested-by: default avatarSPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: default avatarShuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com>
Reviewed-by: default avatarJim Harris <james.r.harris@intel.com>
parent 0f25ee97
Loading
Loading
Loading
Loading
+40 −7
Original line number Diff line number Diff line
@@ -877,6 +877,19 @@ spdk_vhost_event_async_fn(void *arg1, void *arg2)
		vdev = NULL;
	}

	if (vdev != NULL && vdev->lcore >= 0 &&
	    (uint32_t)vdev->lcore != spdk_env_get_current_core()) {
		/* if vdev has been relocated to other core, it is no longer thread-safe
		 * to access its contents here. Even though we're running under global vhost
		 * mutex, the controller itself (and its pollers) are not. We need to chase
		 * the vdev thread as many times as necessary.
		 */
		ev = spdk_event_allocate(vdev->lcore, spdk_vhost_event_async_fn, arg1, arg2);
		spdk_event_call(ev);
		pthread_mutex_unlock(&g_spdk_vhost_mutex);
		return;
	}

	ctx->cb_fn(vdev, arg2);
	pthread_mutex_unlock(&g_spdk_vhost_mutex);

@@ -901,14 +914,32 @@ spdk_vhost_event_async_foreach_fn(void *arg1, void *arg2)
	}

	vdev = spdk_vhost_dev_find_by_id(ctx->vdev_id);
	if (vdev == ctx->vdev) {
		ctx->cb_fn(vdev, arg2);
	} else {
	if (vdev != ctx->vdev) {
		/* ctx->vdev is probably a dangling pointer at this point.
		 * It must have been removed in the meantime, so we just skip
		 * it in our foreach chain. */
		goto out_unlock_continue;
	}

	/* the assert is just for static analyzers, vdev cannot be NULL here */
	assert(vdev != NULL);
	if (vdev->lcore >= 0 &&
	    (uint32_t)vdev->lcore != spdk_env_get_current_core()) {
		/* if vdev has been relocated to other core, it is no longer thread-safe
		 * to access its contents here. Even though we're running under global vhost
		 * mutex, the controller itself (and its pollers) are not. We need to chase
		 * the vdev thread as many times as necessary.
		 */
		ev = spdk_event_allocate(vdev->lcore,
					 spdk_vhost_event_async_foreach_fn, arg1, arg2);
		spdk_event_call(ev);
		pthread_mutex_unlock(&g_spdk_vhost_mutex);
		return;
	}

	ctx->cb_fn(vdev, arg2);

out_unlock_continue:
	vdev = spdk_vhost_dev_next(ctx->vdev_id);
	spdk_vhost_external_event_foreach_continue(vdev, ctx->cb_fn, arg2);
	pthread_mutex_unlock(&g_spdk_vhost_mutex);
@@ -917,7 +948,7 @@ spdk_vhost_event_async_foreach_fn(void *arg1, void *arg2)
}

static int
spdk_vhost_event_send(struct spdk_vhost_dev *vdev, spdk_vhost_event_fn cb_fn,
_spdk_vhost_event_send(struct spdk_vhost_dev *vdev, spdk_vhost_event_fn cb_fn,
		       unsigned timeout_sec, const char *errmsg)
{
	struct spdk_vhost_dev_event_ctx ev_ctx = {0};
@@ -936,6 +967,7 @@ spdk_vhost_event_send(struct spdk_vhost_dev *vdev, spdk_vhost_event_fn cb_fn,
	ev = spdk_event_allocate(vdev->lcore, spdk_vhost_event_cb, &ev_ctx, NULL);
	assert(ev);
	spdk_event_call(ev);
	pthread_mutex_unlock(&g_spdk_vhost_mutex);

	clock_gettime(CLOCK_REALTIME, &timeout);
	timeout.tv_sec += timeout_sec;
@@ -947,6 +979,7 @@ spdk_vhost_event_send(struct spdk_vhost_dev *vdev, spdk_vhost_event_fn cb_fn,
	}

	sem_destroy(&ev_ctx.sem);
	pthread_mutex_lock(&g_spdk_vhost_mutex);
	return ev_ctx.response;
}

@@ -999,7 +1032,7 @@ stop_device(int vid)
		return;
	}

	rc = spdk_vhost_event_send(vdev, vdev->backend->stop_device, 3, "stop device");
	rc = _spdk_vhost_event_send(vdev, vdev->backend->stop_device, 3, "stop device");
	if (rc != 0) {
		SPDK_ERRLOG("Couldn't stop device with vid %d.\n", vid);
		pthread_mutex_unlock(&g_spdk_vhost_mutex);
@@ -1089,7 +1122,7 @@ start_device(int vid)

	vdev->lcore = spdk_vhost_allocate_reactor(vdev->cpumask);
	spdk_vhost_dev_mem_register(vdev);
	rc = spdk_vhost_event_send(vdev, vdev->backend->start_device, 3, "start device");
	rc = _spdk_vhost_event_send(vdev, vdev->backend->start_device, 3, "start device");
	if (rc != 0) {
		spdk_vhost_dev_mem_unregister(vdev);
		free(vdev->mem);