[Libguestfs] [libnbd PATCH 2/3] states: Skip server replies to unknown command cookies

Eric Blake eblake at redhat.com
Fri Aug 12 21:04:00 UTC 2022


Instead of killing the connection when the server gives us an
unexpected structured or error reply for a cookie we don't recognize,
we can log the issue and otherwise ignore that packet (an unexpected
successful simple reply is harder if we did not negotiate structured
replies; see the lengthy comment in states-reply-simple.c).  A
compliant server will never do this to us, but a transmission error
might.  Here's a quick patch to nbdkit to demonstrate such a scenario:

| diff --git i/server/protocol.c w/server/protocol.c
| index 2ac77055..55b27e2e 100644
| --- i/server/protocol.c
| +++ w/server/protocol.c
| @@ -742,7 +742,7 @@ protocol_recv_request_send_reply (void)
|        (cmd == NBD_CMD_READ || cmd == NBD_CMD_BLOCK_STATUS)) {
|      if (!error) {
|        if (cmd == NBD_CMD_READ)
| -        return send_structured_reply_read (request.handle, cmd,
| +        return send_structured_reply_read (0x100^request.handle, cmd,
|                                             buf, count, offset);
|        else /* NBD_CMD_BLOCK_STATUS */
|          return send_structured_reply_block_status (request.handle,

Note that if a server bit-flips a cookie, our original command waiting
for the right cookie may never get a completion packet from the
server; thus, testing this requires aio commands (unless we someday
add APIs to make it easier for blocking commands that give up when
they hit a timeout).  With the following command line and the hack to
nbdkit above:

$ ./run /path/to/nbdkit -U - memory 1M --run 'nbdsh -u "$uri" -c "\
buf=nbd.Buffer(1)
cookie=h.aio_pread(buf, 0)
try:
  print(h.poll(1))
except nbd.Error as ex:
  print(ex.string)
h.flush()
print(h.get_size())
"'

pre-patch we see that the connection is killed when h.poll()
encounters the garbage reply, before we get to the second command:

nbd_poll: no matching cookie found for server reply, this is probably a bug in the server
nbdkit: memory.1: error: read request: Connection reset by peer
nbdsh: command line script failed: nbd_flush: invalid state: DEAD: the handle must be connected with the server: Invalid argument

while post-patch the read never retires, but the unexpected cookie is
skipped and we get to the flush:

1
1048576

Watching the LIBNBD_DEBUG=1 stream, we also see:

libnbd: debug: nbdsh: nbd_poll: skipped reply for unexpected cookie 281474976710657, this is probably a bug in the server
---
 generator/states-reply-simple.c     | 25 ++++++++++++++++++++++---
 generator/states-reply-structured.c | 24 +++++++++++++-----------
 generator/states-reply.c            | 27 ++++++++++++---------------
 3 files changed, 47 insertions(+), 29 deletions(-)

diff --git a/generator/states-reply-simple.c b/generator/states-reply-simple.c
index 2a7b9a9..24f7efa 100644
--- a/generator/states-reply-simple.c
+++ b/generator/states-reply-simple.c
@@ -25,10 +25,29 @@ STATE_MACHINE {
   uint64_t cookie;

   error = be32toh (h->sbuf.simple_reply.error);
-  cookie = be64toh (h->sbuf.simple_reply.handle);

-  assert (cmd);
-  assert (cmd->cookie == cookie);
+  if (cmd == NULL) {
+    /* Unexpected reply.  If error was set or we have structured
+     * replies, we know there should be no payload, so the next byte
+     * on the wire (if any) will be another reply, and we can let
+     * FINISH_COMMAND diagnose/ignore the server bug.  If not, we lack
+     * context to know whether the server thinks it was responding to
+     * NBD_CMD_READ, so it is safer to move to DEAD now than to risk
+     * consuming a server's potential data payload as a reply stream
+     * (even though we would be likely to produce a magic number
+     * mismatch on the next pass that would also move us to DEAD).
+     */
+    if (error || h->structured_replies)
+      SET_NEXT_STATE (%^FINISH_COMMAND);
+    else {
+      cookie = be64toh (h->sbuf.simple_reply.handle);
+      SET_NEXT_STATE (%.DEAD);
+      set_error (EPROTO,
+                 "no matching cookie %" PRIu64 " found for server reply, "
+                 "this is probably a server bug", cookie);
+    }
+    return 0;
+  }

   if (cmd->type == NBD_CMD_READ && h->structured_replies) {
     set_error (0, "server sent unexpected simple reply for read");
diff --git a/generator/states-reply-structured.c b/generator/states-reply-structured.c
index 2db2cf6..4fbd62f 100644
--- a/generator/states-reply-structured.c
+++ b/generator/states-reply-structured.c
@@ -69,17 +69,12 @@ STATE_MACHINE {
  REPLY.STRUCTURED_REPLY.CHECK:
   struct command *cmd = h->reply_cmd;
   uint16_t flags, type;
-  uint64_t cookie;
   uint32_t length;

   flags = be16toh (h->sbuf.sr.structured_reply.flags);
   type = be16toh (h->sbuf.sr.structured_reply.type);
-  cookie = be64toh (h->sbuf.sr.structured_reply.handle);
   length = be32toh (h->sbuf.sr.structured_reply.length);

-  assert (cmd);
-  assert (cmd->cookie == cookie);
-
   /* Reject a server that replies with too much information, but don't
    * reject a single structured reply to NBD_CMD_READ on the largest
    * size we were willing to send. The most likely culprit is a server
@@ -88,12 +83,13 @@ STATE_MACHINE {
    * not worth keeping the connection alive.
    */
   if (length > MAX_REQUEST_SIZE + sizeof h->sbuf.sr.payload.offset_data) {
-    set_error (0, "invalid server reply length");
+    set_error (0, "invalid server reply length %" PRIu32, length);
     SET_NEXT_STATE (%.DEAD);
     return 0;
   }

-  if (!h->structured_replies) {
+  /* Skip an unexpected structured reply, including to an unknown cookie. */
+  if (cmd == NULL || !h->structured_replies) {
   resync:
     h->rbuf = NULL;
     h->rlen = length;
@@ -487,7 +483,6 @@ STATE_MACHINE {
   uint16_t type;
   uint32_t length;

-  assert (cmd);
   assert (h->rbuf == NULL);
   switch (recv_into_rbuf (h)) {
   case -1: SET_NEXT_STATE (%.DEAD); return 0;
@@ -496,6 +491,15 @@ STATE_MACHINE {
     SET_NEXT_STATE (%.READY);
     return 0;
   case 0:
+    /* If this reply is to an unknown command, FINISH_COMMAND will
+     * diagnose and ignore the server bug.  Otherwise, ensure the
+     * pending command sees a failure of EPROTO if it does not already
+     * have an error.
+     */
+    if (cmd == NULL) {
+      SET_NEXT_STATE (%^FINISH_COMMAND);
+      return 0;
+    }
     type = be16toh (h->sbuf.sr.structured_reply.type);
     length = be32toh (h->sbuf.sr.structured_reply.length);
     debug (h, "unexpected reply type %u or payload length %" PRIu32
@@ -505,10 +509,8 @@ STATE_MACHINE {
     if (cmd->error == 0)
       cmd->error = EPROTO;
     SET_NEXT_STATE (%FINISH);
-    return 0;
-  default:
-    abort ();
   }
+  return 0;

  REPLY.STRUCTURED_REPLY.FINISH:
   uint16_t flags;
diff --git a/generator/states-reply.c b/generator/states-reply.c
index 1a3a20a..0736342 100644
--- a/generator/states-reply.c
+++ b/generator/states-reply.c
@@ -124,7 +124,7 @@ STATE_MACHINE {
   }
   else {
     SET_NEXT_STATE (%.DEAD); /* We've probably lost synchronization. */
-    set_error (0, "invalid reply magic");
+    set_error (0, "invalid reply magic 0x%" PRIx32, magic);
     return 0;
   }

@@ -132,22 +132,13 @@ STATE_MACHINE {
    * handle (our cookie) is stored at the same offset.
    */
   cookie = be64toh (h->sbuf.simple_reply.handle);
-  /* Find the command amongst the commands in flight. */
+  /* Find the command amongst the commands in flight. If the server sends
+   * a reply for an unknown cookie, FINISH will diagnose that later.
+   */
   for (cmd = h->cmds_in_flight; cmd != NULL; cmd = cmd->next) {
     if (cmd->cookie == cookie)
       break;
   }
-  if (cmd == NULL) {
-    /* An unexpected structured reply could be skipped, since it
-     * includes a length; similarly an unexpected simple reply can be
-     * skipped if we assume it was not a read. However, it's more
-     * likely we've lost synchronization with the server.
-     */
-    SET_NEXT_STATE (%.DEAD);
-    set_error (0, "no matching cookie found for server reply, "
-               "this is probably a bug in the server");
-    return 0;
-  }
   h->reply_cmd = cmd;
   return 0;

@@ -167,10 +158,16 @@ STATE_MACHINE {
     if (cmd->cookie == cookie)
       break;
   }
-  assert (cmd != NULL);
   assert (h->reply_cmd == cmd);
-  h->reply_cmd = NULL;
+  if (cmd == NULL) {
+    debug (h, "skipped reply for unexpected cookie %" PRIu64
+           ", this is probably a bug in the server", cookie);
+    SET_NEXT_STATE (%.READY);
+    return 0;
+  }
+
   retire = cmd->type == NBD_CMD_DISC;
+  h->reply_cmd = NULL;

   /* Notify the user */
   if (CALLBACK_IS_NOT_NULL (cmd->cb.completion)) {
-- 
2.37.1



More information about the Libguestfs mailing list