Commit e71e81b6 authored by Seth Howell's avatar Seth Howell Committed by Tomasz Zawadzki
Browse files

sock: keep track of removed sockets during call to poll



We have been intermittently hitting the assert where
we check sock->cb_fn != NULL in spdk_sock_group_impl_poll_count.

The only way we could be hitting this specific error is if we
wereremoving a socket from a sock group within after receiving
an event for it.

Specifically, we are seeing this error on the NVMe-oF TCP target
which relies on posix sockets using epoll.

The man page for epoll states the following:

 If you use an event cache or store all the file descriptors
 returned from epoll_wait(2), then make sure to provide
 a  way  to  mark its closure dynamically (i.e., caused by
 a previous event's processing).  Suppose you receive 100 events
 from epoll_wait(2), and in event #47 a condition causes event
 #13 to be closed.  If you remove  the  structure  and close(2)
 the file descriptor for event #13, then your event cache might
 still say there are events waiting for that file descriptor
 causing confusion.

 One solution for this is to call, during the processing
 of  event  47,  epoll_ctl(EPOLL_CTL_DEL)  to  delete  file
 descriptor  13 and close(2), then mark its associated data
 structure as removed and link it to a cleanup list.  If
 you find another event for file descriptor 13 in your batch
 processing, you will discover the file descriptor  had
 been previously removed and there will be no confusion.

Since we do store all of the file descriptors returned from
epoll_wait, we need to implement the tracking mentioned above.

fixes issue #1294

Signed-off-by: default avatarSeth Howell <seth.howell@intel.com>
Change-Id: Ib592ce19e3f0b691e3a825d02ebb42d7338e3ceb
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/1589


Tested-by: default avatarSPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: default avatarBen Walker <benjamin.walker@intel.com>
Reviewed-by: default avatarShuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com>
Reviewed-by: default avatarAleksey Marchuk <alexeymar@mellanox.com>
parent e9063d4d
Loading
Loading
Loading
Loading
+6 −0
Original line number Diff line number Diff line
@@ -77,6 +77,12 @@ struct spdk_sock_group_impl {
	struct spdk_net_impl			*net_impl;
	TAILQ_HEAD(, spdk_sock)			socks;
	STAILQ_ENTRY(spdk_sock_group_impl)	link;
	/* List of removed sockets. refreshed each time we poll the sock group. */
	int					num_removed_socks;
	/* Unfortunately, we can't just keep a tailq of the sockets in case they are freed
	 * or added to another poll group later.
	 */
	uintptr_t				removed_socks[MAX_EVENTS_PER_POLL];
};

struct spdk_net_impl {
+20 −2
Original line number Diff line number Diff line
@@ -404,6 +404,7 @@ spdk_sock_group_create(void *ctx)
		if (group_impl != NULL) {
			STAILQ_INSERT_TAIL(&group->group_impls, group_impl, link);
			TAILQ_INIT(&group_impl->socks);
			group_impl->num_removed_socks = 0;
			group_impl->net_impl = impl;
		}
	}
@@ -500,6 +501,9 @@ spdk_sock_group_remove_sock(struct spdk_sock_group *group, struct spdk_sock *soc
	rc = group_impl->net_impl->group_impl_remove_sock(group_impl, sock);
	if (rc == 0) {
		TAILQ_REMOVE(&group_impl->socks, sock, link);
		assert(group_impl->num_removed_socks < MAX_EVENTS_PER_POLL);
		group_impl->removed_socks[group_impl->num_removed_socks] = (uintptr_t)sock;
		group_impl->num_removed_socks++;
		sock->group_impl = NULL;
		sock->cb_fn = NULL;
		sock->cb_arg = NULL;
@@ -526,6 +530,9 @@ spdk_sock_group_impl_poll_count(struct spdk_sock_group_impl *group_impl,
		return 0;
	}

	/* The number of removed sockets should be reset for each call to poll. */
	group_impl->num_removed_socks = 0;

	num_events = group_impl->net_impl->group_impl_poll(group_impl, max_events, socks);
	if (num_events == -1) {
		return -1;
@@ -533,10 +540,21 @@ spdk_sock_group_impl_poll_count(struct spdk_sock_group_impl *group_impl,

	for (i = 0; i < num_events; i++) {
		struct spdk_sock *sock = socks[i];
		int j;
		bool valid = true;
		for (j = 0; j < group_impl->num_removed_socks; j++) {
			if ((uintptr_t)sock == group_impl->removed_socks[j]) {
				valid = false;
				break;
			}
		}

		if (valid) {
			assert(sock->cb_fn != NULL);
			sock->cb_fn(sock->cb_arg, group, sock);
		}
	}

	return num_events;
}