[Libguestfs] [libnbd PATCH] tests: Make errors more robust under load

Wed Jul 3 02:29:19 UTC 2019

When run under valgrind, the 'errors' test would sometimes fail
because a single NBD_CMD_WRITE managed to actually send() the entire
packet to the server without blocking the state machine.  To make
things more robust, switch to a server which is serialized (memory is
parallel, but sh is serial), and which intentionally does not read a
second command until the first is processed, then switch the client to
send two commands rather than one (whether or not the first loses the
race with blocking, we can now guarantee the second will block).

Since we are now writing a shell script, it is also easy to use that
script to add another improvement: check that a server error
propagates back to the client as expected. Surprisingly, this
uncovered an nbdkit bug (now fixed) - it was very difficult to coerce
nbdkit sh into failing with anything other than EIO, because it
mistakenly used strcmp() for output ending in a trailing space but not
a trailing newline.
---

I'm pushing this now, because it at least solved the valgrind failure
for me (the failure wansn't 100% pre-patch, but common enough at least
once in 10 runs, where now I went 50 runs without the failure).

 tests/errors.c | 95 +++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 91 insertions(+), 4 deletions(-)

diff --git a/tests/errors.c b/tests/errors.c
index faa1488..b4ff665 100644
--- a/tests/errors.c
+++ b/tests/errors.c
@@ -24,6 +24,8 @@
 #include <stdlib.h>
 #include <string.h>
 #include <errno.h>
+#include <unistd.h>
+#include <sys/stat.h>

 #include <libnbd.h>

@@ -53,15 +55,73 @@ check (int experr, const char *prefix)
   }
 }

+static char script[] = "/tmp/libnbd-errors-scriptXXXXXX";
+static char witness[] = "/tmp/libnbd-errors-witnessXXXXXX";
+static int script_fd = -1, witness_fd = -1;
+
+static void
+cleanup (void)
+{
+  if (script_fd != -1) {
+    if (script_fd >= 0)
+      close (script_fd);
+    unlink (script);
+  }
+  if (witness_fd >= 0) {
+    close (witness_fd);
+    unlink (witness);
+  }
+}
+
 int
 main (int argc, char *argv[])
 {
   struct nbd_handle *nbd;
-  const char *cmd[] = { "nbdkit", "-s", "--exit-with-parent", "memory",
-                        "size=128m", NULL };
+  /* We will connect to a custom nbdkit sh plugin which always fails
+   * on reads (with a precise spelling required for older nbdkit), and
+   * which delays responding to writes until a witness file no longer
+   * exists.
+   */
+  const char *cmd[] = { "nbdkit", "-s", "--exit-with-parent", "sh",
+                        script, NULL };

   progname = argv[0];

+  if (atexit (cleanup) != 0) {
+    perror ("atexit");
+    exit (EXIT_FAILURE);
+  }
+  if ((script_fd = mkstemp (script)) == -1) {
+    perror ("mkstemp");
+    exit (EXIT_FAILURE);
+  }
+  if ((witness_fd = mkstemp (witness)) == -1) {
+    perror ("mkstemp");
+    exit (EXIT_FAILURE);
+  }
+
+  if (dprintf (script_fd, "case $1 in\n"
+               "  get_size) echo 128m || exit 1 ;;\n"
+               "  pread) printf 'ENOMEM ' >&2; exit 1 ;;\n"
+               "  can_write) exit 0 ;;\n"
+               "  pwrite)\n"
+               "    while test -e %s; do sleep 1; done\n"
+               "    exit 0;;\n"
+               "  *) exit 2 ;;\n"
+               "esac\n", witness) < 0) {
+    perror ("dprintf");
+    exit (EXIT_FAILURE);
+  }
+  if (fchmod (script_fd, 0700) == -1) {
+    perror ("fchmod");
+    exit (EXIT_FAILURE);
+  }
+  if (close (script_fd) == -1) {  /* Unlinked later during atexit */
+    perror ("close");
+    exit (EXIT_FAILURE);
+  }
+  script_fd = -2;
+
   nbd = nbd_create ();
   if (nbd == NULL) {
     fprintf (stderr, "%s\n", nbd_get_error ());
@@ -167,13 +227,29 @@ main (int argc, char *argv[])
   }
   check (ERANGE, "nbd_aio_pwrite: ");

-  /* Queue up a write command so large that we block on POLLIN, then queue
-   * multiple disconnects.
+  /* Send a read that the nbdkit sh plugin will fail. */
+  if (nbd_pread (nbd, buf, 512, 0, 0) != -1) {
+    fprintf (stderr, "%s: test failed: "
+             "nbd_pread did not report server failure\n",
+             argv[0]);
+    exit (EXIT_FAILURE);
+  }
+  check (ENOMEM, "nbd_pread: ");
+
+  /* Queue up two write commands so large that we block on POLLIN (the
+   * first might not block when under load, such as valgrind, but the
+   * second definitely will, since the nbdkit sh plugin reads only one
+   * command at a time and stalls on the first), then queue multiple
+   * disconnects.
    */
   if (nbd_aio_pwrite (nbd, buf, 2 * 1024 * 1024, 0, 0) == -1) {
     fprintf (stderr, "%s: %s\n", argv[0], nbd_get_error ());
     exit (EXIT_FAILURE);
   }
+  if (nbd_aio_pwrite (nbd, buf, 2 * 1024 * 1024, 0, 0) == -1) {
+    fprintf (stderr, "%s: %s\n", argv[0], nbd_get_error ());
+    exit (EXIT_FAILURE);
+  }
   if ((nbd_aio_get_direction (nbd) & LIBNBD_AIO_DIRECTION_WRITE) == 0) {
     fprintf (stderr, "%s: test failed: "
              "expect to be blocked on write\n",
@@ -192,6 +268,17 @@ main (int argc, char *argv[])
   }
   check (EINVAL, "nbd_aio_disconnect: ");

+  /* Unblock the nbdkit sh plugin */
+  if (close (witness_fd) == -1) {
+    perror ("close");
+    exit (EXIT_FAILURE);
+  }
+  witness_fd = -1;
+  if (unlink (witness) == -1) {
+    perror ("unlink");
+    exit (EXIT_FAILURE);
+  }
+
   /* Flush the queue (whether this one fails is a race with how fast
    * the server shuts down, so don't enforce status), then try to send
    * another command while CLOSED/DEAD
-- 
2.20.1