[Libguestfs] [libnbd PATCH] tests: Add test for abrupt server death

Eric Blake eblake at redhat.com
Thu Jun 27 05:57:55 UTC 2019


It's worth testing that our transition to the DEAD state works as
expected, by intentionally killing a server.

This test also makes it possible to test what happens when pending
commands are stranded, so that an upcoming patch to add notifiers can
show the difference it makes.

The test was surprisingly hard to write. For starters, sending SIGINT
to nbdkit was sometimes enough to kill the process (if it hadn't yet
read the NBD_CMD_READ, and therefore did try to wait for any
outstanding requests before quitting), but often it did not (because
nbdkit was stuck waiting for pthread_join()). Then there's the race
that nbd_poll() can sometimes get lucky enough to catch a POLLHUP in
REPLY.START where recv() returning 0 transitions things to CLOSED, but
more often catches a POLLERR and transitions to DEAD.  Then there was
the hour I spent scratching my head why kill didn't seem to get rid of
the child process even though the poll() was definitely seeing the fd
closing, until I remembered that kill(pid,0) to zombie processes
succeeds until you wait() or alter SIGCHLD to specifically prevent
zombies.  And debugging the now-fixed uninitialized variable in
nbd_unlocked_poll added to the mix.

I'm not 100% sure the test is portable to non-Linux - I guess we'll
eventually find out when someone worries about a BSD port of libnbd.
---

Applies on top of my nbd_poll return value semantic change.

I'm open to bike-shedding on the test name.

 .gitignore         |   1 +
 tests/Makefile.am  |   7 +++
 tests/disconnect.c | 143 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 151 insertions(+)
 create mode 100644 tests/disconnect.c

diff --git a/.gitignore b/.gitignore
index ea496ac..2b8092c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -109,6 +109,7 @@ Makefile.in
 /tests/connect-unix
 /tests/connect-uri-tcp
 /tests/connect-uri-unix
+/tests/disconnect
 /tests/errors
 /tests/functions.sh
 /tests/get-size
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 0314d91..bbd7330 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -43,6 +43,7 @@ check_DATA =

 check_PROGRAMS = \
 	errors \
+	disconnect \
 	get-size \
 	read-only-flag \
 	read-write-flag \
@@ -77,6 +78,7 @@ LOG_COMPILER = $(top_builddir)/run

 TESTS = \
 	errors \
+	disconnect \
 	get-size \
 	read-only-flag \
 	read-write-flag \
@@ -108,6 +110,11 @@ errors_CPPFLAGS = -I$(top_srcdir)/include
 errors_CFLAGS = $(WARNINGS_CFLAGS)
 errors_LDADD = $(top_builddir)/lib/libnbd.la

+disconnect_SOURCES = disconnect.c
+disconnect_CPPFLAGS = -I$(top_srcdir)/include
+disconnect_CFLAGS = $(WARNINGS_CFLAGS)
+disconnect_LDADD = $(top_builddir)/lib/libnbd.la
+
 get_size_SOURCES = get-size.c
 get_size_CPPFLAGS = -I$(top_srcdir)/include
 get_size_CFLAGS = $(WARNINGS_CFLAGS)
diff --git a/tests/disconnect.c b/tests/disconnect.c
new file mode 100644
index 0000000..677d44b
--- /dev/null
+++ b/tests/disconnect.c
@@ -0,0 +1,143 @@
+/* NBD client library in userspace
+ * Copyright (C) 2013-2019 Red Hat Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/* Deliberately disconnect while stranding commands, to check their status.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <signal.h>
+
+#include <libnbd.h>
+
+int
+main (int argc, char *argv[])
+{
+  struct nbd_handle *nbd;
+  char buf[512];
+  int64_t handle;
+  char pidfile[] = "/tmp/libnbd-test-disconnectXXXXXX";
+  int fd;
+  pid_t pid;
+  int r;
+  int delay = 0;
+  const char *cmd[] = { "nbdkit", "--pidfile", pidfile, "-s",
+                        "--exit-with-parent", "--filter=delay", "memory",
+                        "size=1m", "delay-reads=5", NULL };
+
+  /* We're going to kill the child, but don't want to wait for a zombie */
+  if (signal (SIGCHLD, SIG_IGN) == SIG_ERR) {
+    fprintf (stderr, "%s: signal: %s\n", argv[0], strerror (errno));
+    exit (EXIT_FAILURE);
+  }
+
+  fd = mkstemp (pidfile);
+  if (fd < 0) {
+    fprintf (stderr, "%s: mkstemp: %s\n", argv[0], strerror (errno));
+    exit (EXIT_FAILURE);
+  }
+
+  nbd = nbd_create ();
+  if (nbd == NULL) {
+    fprintf (stderr, "%s: %s\n", argv[0], nbd_get_error ());
+    goto fail;
+  }
+
+  /* Connect to a slow server. */
+  if (nbd_connect_command (nbd, (char **) cmd) == -1) {
+    fprintf (stderr, "%s: %s\n", argv[0], nbd_get_error ());
+    goto fail;
+  }
+  if (read (fd, buf, sizeof buf) == -1) {
+    fprintf (stderr, "%s: read: %s\n", argv[0], strerror (errno));
+    goto fail;
+  }
+  pid = atoi (buf);
+  if (pid <= 0) {
+    fprintf (stderr, "%s: unable to parse server's pid\n", argv[0]);
+    goto fail;
+  }
+
+  /* Issue a read that should not complete yet. */
+  if ((handle = nbd_aio_pread (nbd, buf, sizeof buf, 0, 0)) == -1) {
+    fprintf (stderr, "%s: test failed: nbd_aio_pread\n", argv[0]);
+    goto fail;
+  }
+  if (nbd_aio_peek_command_completed (nbd) != 0) {
+    fprintf (stderr, "%s: test failed: nbd_aio_peek_command_completed\n",
+             argv[0]);
+    goto fail;
+  }
+  if (nbd_aio_command_completed (nbd, handle) != 0) {
+    fprintf (stderr, "%s: test failed: nbd_aio_command_completed\n", argv[0]);
+    goto fail;
+  }
+
+  /* Kill the server forcefully (SIGINT is not always strong enough,
+   * as nbdkit waits for pending transactions to finish before
+   * actually exiting), although it's a race whether our signal
+   * arrives while nbdkit has a pending transaction.
+   */
+  if (kill (pid, SIGKILL) == -1) {
+    fprintf (stderr, "%s: kill: %s\n", argv[0], strerror (errno));
+    goto fail;
+  }
+  /* Wait up to 10 seconds, 100 ms at a time */
+  while (kill (pid, 0) == 0) {
+    if (delay++ > 100) {
+      fprintf (stderr, "%s: kill taking too long\n", argv[0]);
+      goto fail;
+    }
+    usleep (100 * 1000);
+  }
+
+  /* This part is somewhat racy if we don't wait for the process death
+   * above - depending on load and timing, nbd_poll may succeed or
+   * fail, and we may transition to either CLOSED (the state machine
+   * saw a clean EOF) or DEAD (the state machine saw a stranded
+   * transaction or POLLERR).
+   */
+  while ((r = nbd_poll (nbd, 1000)) == 1)
+    if (nbd_aio_is_dead (nbd) || nbd_aio_is_closed (nbd))
+      break;
+  if (!(nbd_aio_is_dead (nbd) || nbd_aio_is_closed (nbd))) {
+    fprintf (stderr, "%s: test failed: server death not detected\n", argv[0]);
+    goto fail;
+  }
+
+  /* Proof that the read was stranded */
+  if (nbd_aio_peek_command_completed (nbd) != 0) {
+    fprintf (stderr, "%s: test failed: nbd_aio_peek_command_completed\n",
+             argv[0]);
+    goto fail;
+  }
+  if (nbd_aio_command_completed (nbd, handle) != 0) {
+    fprintf (stderr, "%s: test failed: nbd_aio_command_completed\n", argv[0]);
+    goto fail;
+  }
+
+  nbd_close (nbd);
+  exit (EXIT_SUCCESS);
+
+ fail:
+  unlink (pidfile);
+  exit (EXIT_FAILURE);
+}
-- 
2.20.1




More information about the Libguestfs mailing list