[Libguestfs] [PATCH nbdkit v5 FINAL 03/19] server: protocol: Implement Block Status "base:allocation".

Thu Mar 28 16:18:30 UTC 2019

This commit implements the protocol side of supporting
NBD_CMD_BLOCK_STATUS to read "base:allocation" information about
disks.  The practical result of this is that clients are able to read
which parts of a disk are allocated, holes or zeroes.
---
 docs/nbdkit-protocol.pod             |   5 +-
 common/protocol/protocol.h           |  15 ++
 server/internal.h                    |   8 +
 server/protocol-handshake-newstyle.c |  63 ++++++-
 server/protocol-handshake.c          |  11 ++
 server/protocol.c                    | 240 +++++++++++++++++++++++++--
 TODO                                 |   4 +-
 7 files changed, 328 insertions(+), 18 deletions(-)

diff --git a/docs/nbdkit-protocol.pod b/docs/nbdkit-protocol.pod
index 4c65e00..a9a3390 100644
--- a/docs/nbdkit-protocol.pod
+++ b/docs/nbdkit-protocol.pod
@@ -134,7 +134,10 @@ Supported in nbdkit E<ge> 1.11.8.
 
 =item Block Status
 
-I<Not supported>.
+Supported in nbdkit E<ge> 1.11.10.
+
+Only C<base:allocation> (ie. querying which parts of an image are
+sparse) is supported.
 
 =item Resize Extension
 
diff --git a/common/protocol/protocol.h b/common/protocol/protocol.h
index 06b917e..a7de2f0 100644
--- a/common/protocol/protocol.h
+++ b/common/protocol/protocol.h
@@ -112,6 +112,7 @@ extern const char *name_of_nbd_rep (int);
 #define NBD_REP_ACK          1
 #define NBD_REP_SERVER       2
 #define NBD_REP_INFO         3
+#define NBD_REP_META_CONTEXT 4
 #define NBD_REP_ERR_UNSUP    0x80000001
 #define NBD_REP_ERR_POLICY   0x80000002
 #define NBD_REP_ERR_INVALID  0x80000003
@@ -128,6 +129,18 @@ struct fixed_new_option_reply_info_export {
   uint16_t eflags;              /* per-export flags */
 } __attribute__((packed));
 
+/* NBD_REP_META_CONTEXT reply payload (follows fixed_new_option_reply). */
+struct fixed_new_option_reply_meta_context {
+  uint32_t context_id;          /* metadata context ID (big endian) */
+  /* followed by the context name, sent without a trailing NUL */
+} __attribute__((packed));
+
+/* NBD_REPLY_TYPE_BLOCK_STATUS block descriptor (big endian on the wire). */
+struct block_descriptor {
+  uint32_t length;              /* length of block */
+  uint32_t status_flags;        /* block type (hole etc); bits 0-1 only */
+} __attribute__((packed));
+
 /* New-style handshake server reply when using NBD_OPT_EXPORT_NAME.
  * Modern clients use NBD_OPT_GO instead of this.
  */
@@ -199,10 +212,12 @@ extern const char *name_of_nbd_cmd (int);
 #define NBD_CMD_FLUSH             3
 #define NBD_CMD_TRIM              4
 #define NBD_CMD_WRITE_ZEROES      6
+#define NBD_CMD_BLOCK_STATUS      7
 
 extern const char *name_of_nbd_cmd_flag (int);
 #define NBD_CMD_FLAG_FUA      (1<<0)
 #define NBD_CMD_FLAG_NO_HOLE  (1<<1)
+#define NBD_CMD_FLAG_REQ_ONE  (1<<3)
 
 /* Error codes (previously errno).
  * See http://git.qemu.org/?p=qemu.git;a=commitdiff;h=ca4414804114fd0095b317785bc0b51862e62ebb
diff --git a/server/internal.h b/server/internal.h
index ae51804..d804441 100644
--- a/server/internal.h
+++ b/server/internal.h
@@ -181,8 +181,10 @@ struct connection {
   bool can_zero;
   bool can_fua;
   bool can_multi_conn;
+  bool can_extents;
   bool using_tls;
   bool structured_replies;
+  bool meta_context_base_allocation;
 
   int sockin, sockout;
   connection_recv_function recv;
@@ -219,6 +221,12 @@ extern int protocol_handshake_newstyle (struct connection *conn)
 extern int protocol_recv_request_send_reply (struct connection *conn)
   __attribute__((__nonnull__ (1)));
 
+/* The context ID of base:allocation, returned when the client sets
+ * that context and sent at the start of each block status reply.  As
+ * far as I can tell the value doesn't matter so long as it is constant.
+ */
+#define base_allocation_id 1
+
 /* crypto.c */
 #define root_tls_certificates_dir sysconfdir "/pki/" PACKAGE_NAME
 extern void crypto_init (bool tls_set_on_cli);
diff --git a/server/protocol-handshake-newstyle.c b/server/protocol-handshake-newstyle.c
index db01f7b..6899e6c 100644
--- a/server/protocol-handshake-newstyle.c
+++ b/server/protocol-handshake-newstyle.c
@@ -133,6 +133,34 @@ send_newstyle_option_reply_info_export (struct connection *conn,
   return 0;
 }
 
+/* Send an option reply made of the fixed header, context ID and name. */
+static int
+send_newstyle_option_reply_meta_context (struct connection *conn,
+                                         uint32_t option, uint32_t reply,
+                                         uint32_t context_id,
+                                         const char *name)
+{
+  const size_t name_size = strlen (name);      /* NUL is not sent */
+  struct fixed_new_option_reply_meta_context ctx;
+  struct fixed_new_option_reply hdr;
+
+  /* replylen counts everything after the header: context ID + name. */
+  hdr.magic = htobe64 (NBD_REP_MAGIC);
+  hdr.option = htobe32 (option);
+  hdr.reply = htobe32 (reply);
+  hdr.replylen = htobe32 (sizeof ctx + name_size);
+  ctx.context_id = htobe32 (context_id);
+
+  /* Stop at the first write that fails. */
+  if (conn->send (conn, &hdr, sizeof hdr) != -1 &&
+      conn->send (conn, &ctx, sizeof ctx) != -1 &&
+      conn->send (conn, name, name_size) != -1)
+    return 0;
+
+  nbdkit_error ("write: %m");
+  return -1;
+}
+
 /* Sub-function during negotiate_handshake_newstyle, to uniformly handle
  * a client hanging up on a message boundary.
  */
@@ -503,7 +531,15 @@ negotiate_handshake_newstyle_options (struct connection *conn)
          * for SET: nr_queries == 0 means reset all contexts
          */
         if (nr_queries == 0) {
-          /* Nothing is supported now. */
+          if (option == NBD_OPT_SET_META_CONTEXT)
+            conn->meta_context_base_allocation = false;
+          else /* LIST */ {
+            if (send_newstyle_option_reply_meta_context
+                (conn, option, NBD_REP_META_CONTEXT,
+                 0, "base:allocation") == -1)
+              return -1;
+          }
+
           if (send_newstyle_option_reply (conn, option, NBD_REP_ACK) == -1)
             return -1;
         }
@@ -525,7 +561,30 @@ negotiate_handshake_newstyle_options (struct connection *conn)
                    option == NBD_OPT_LIST_META_CONTEXT ? "query" : "set",
                    (int) querylen, &data[opt_index]);
 
-            /* Ignore query - nothing is supported. */
+            /* For LIST, "base:" returns all supported contexts in the
+             * base namespace.  We only support "base:allocation".
+             */
+            if (option == NBD_OPT_LIST_META_CONTEXT &&
+                querylen == 5 &&
+                strncmp (&data[opt_index], "base:", 5) == 0) {
+              if (send_newstyle_option_reply_meta_context
+                  (conn, option, NBD_REP_META_CONTEXT,
+                   0, "base:allocation") == -1)
+                return -1;
+            }
+            /* "base:allocation" requested by name. */
+            else if (querylen == 15 &&
+                     strncmp (&data[opt_index], "base:allocation", 15) == 0) {
+              if (send_newstyle_option_reply_meta_context
+                  (conn, option, NBD_REP_META_CONTEXT,
+                   option == NBD_OPT_SET_META_CONTEXT
+                   ? base_allocation_id : 0,
+                   "base:allocation") == -1)
+                return -1;
+              if (option == NBD_OPT_SET_META_CONTEXT)
+                conn->meta_context_base_allocation = true;
+            }
+            /* Every other query must be ignored. */
 
             opt_index += querylen;
             nr_queries--;
diff --git a/server/protocol-handshake.c b/server/protocol-handshake.c
index 79a5999..9653210 100644
--- a/server/protocol-handshake.c
+++ b/server/protocol-handshake.c
@@ -110,6 +110,17 @@ protocol_compute_eflags (struct connection *conn, uint16_t *flags)
     conn->can_multi_conn = true;
   }
 
+  /* The result of this is not returned to callers here (or at any
+   * time during the handshake).  However it makes sense to do it once
+   * per connection and store the result in the handle anyway.  This
+   * protocol_compute_eflags function is a bit misnamed XXX.
+   */
+  fl = backend->can_extents (backend, conn);
+  if (fl == -1)
+    return -1;
+  if (fl)
+    conn->can_extents = true;
+
   *flags = eflags;
   return 0;
 }
diff --git a/server/protocol.c b/server/protocol.c
index f117d42..383938f 100644
--- a/server/protocol.c
+++ b/server/protocol.c
@@ -36,6 +36,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdint.h>
+#include <stdbool.h>
 #include <inttypes.h>
 #include <string.h>
 #include <unistd.h>
@@ -44,6 +45,7 @@
 
 #include "internal.h"
 #include "byte-swapping.h"
+#include "minmax.h"
 #include "protocol.h"
 
 /* Maximum read or write request that we will handle. */
@@ -78,6 +80,7 @@ validate_request (struct connection *conn,
   case NBD_CMD_WRITE:
   case NBD_CMD_TRIM:
   case NBD_CMD_WRITE_ZEROES:
+  case NBD_CMD_BLOCK_STATUS:
     if (!valid_range (conn, offset, count)) {
       /* XXX Allow writes to extend the disk? */
       nbdkit_error ("invalid request: %s: offset and count are out of range: "
@@ -106,7 +109,8 @@ validate_request (struct connection *conn,
   }
 
   /* Validate flags */
-  if (flags & ~(NBD_CMD_FLAG_FUA | NBD_CMD_FLAG_NO_HOLE)) {
+  if (flags & ~(NBD_CMD_FLAG_FUA | NBD_CMD_FLAG_NO_HOLE |
+                NBD_CMD_FLAG_REQ_ONE)) {
     nbdkit_error ("invalid request: unknown flag (0x%x)", flags);
     *error = EINVAL;
     return false;
@@ -117,6 +121,12 @@ validate_request (struct connection *conn,
     *error = EINVAL;
     return false;
   }
+  if ((flags & NBD_CMD_FLAG_REQ_ONE) &&
+      cmd != NBD_CMD_BLOCK_STATUS) {
+    nbdkit_error ("invalid request: REQ_ONE flag needs BLOCK_STATUS request");
+    *error = EINVAL;
+    return false;
+  }
   if (!conn->can_fua && (flags & NBD_CMD_FLAG_FUA)) {
     nbdkit_error ("invalid request: FUA flag not supported");
     *error = EINVAL;
@@ -157,14 +167,37 @@ validate_request (struct connection *conn,
     return false;
   }
 
+  /* Block status allowed? */
+  if (cmd == NBD_CMD_BLOCK_STATUS) {
+    if (!conn->structured_replies) {
+      nbdkit_error ("invalid request: "
+                    "%s: structured replies was not negotiated",
+                    name_of_nbd_cmd (cmd));
+      *error = EINVAL;
+      return false;
+    }
+    if (!conn->meta_context_base_allocation) {
+      nbdkit_error ("invalid request: "
+                    "%s: base:allocation was not negotiated",
+                    name_of_nbd_cmd (cmd));
+      *error = EINVAL;
+      return false;
+    }
+  }
+
   return true;                     /* Command validates. */
 }
 
 /* This is called with the request lock held to actually execute the
  * request (by calling the plugin).  Note that the request fields have
  * been validated already in 'validate_request' so we don't have to
- * check them again.  'buf' is either the data to be written or the
- * data to be returned, and points to a buffer of size 'count' bytes.
+ * check them again.
+ *
+ * 'buf' is either the data to be written or the data to be returned,
+ * and points to a buffer of size 'count' bytes.
+ *
+ * 'extents' is an empty extents list used for block status requests
+ * only.
  *
  * In all cases, the return value is the system errno value that will
  * later be converted to the nbd error to send back to the client (0
@@ -173,7 +206,7 @@ validate_request (struct connection *conn,
 static uint32_t
 handle_request (struct connection *conn,
                 uint16_t cmd, uint16_t flags, uint64_t offset, uint32_t count,
-                void *buf)
+                void *buf, struct nbdkit_extents *extents)
 {
   uint32_t f = 0;
   bool fua = conn->can_fua && (flags & NBD_CMD_FLAG_FUA);
@@ -217,6 +250,33 @@ handle_request (struct connection *conn,
       return err;
     break;
 
+  case NBD_CMD_BLOCK_STATUS:
+    /* The other backend methods don't check can_*.  That is because
+     * those methods are implicitly suppressed by returning eflags to
+     * the client.  However there is no eflag for extents so we must
+     * check it here.
+     */
+    if (conn->can_extents) {
+      if (flags & NBD_CMD_FLAG_REQ_ONE)
+        f |= NBDKIT_FLAG_REQ_ONE;
+      if (backend->extents (backend, conn, count, offset, f,
+                            extents, &err) == -1)
+        return err;
+    }
+    else {
+      int r;
+
+      /* By default it is safe to assume that everything in the range
+       * is allocated.
+       */
+      errno = 0;
+      r = nbdkit_add_extent (extents, offset, count, 0 /* allocated data */);
+      if (r == -1)
+        return errno ? errno : EINVAL;
+      return 0;
+    }
+    break;
+
   default:
     abort ();
   }
@@ -359,6 +419,143 @@ send_structured_reply_read (struct connection *conn,
   return 1;                     /* command processed ok */
 }
 
+/* Convert a list of extents into NBD_REPLY_TYPE_BLOCK_STATUS blocks.
+ * The rules here are very complicated.  Read the spec carefully!
+ * Returns a calloc'd array of *nr_blocks big endian descriptors which
+ * the caller must free, or NULL if the allocation failed.
+ */
+static struct block_descriptor *
+extents_to_block_descriptors (struct nbdkit_extents *extents,
+                              uint16_t flags, uint32_t count, uint64_t offset,
+                              size_t *nr_blocks)
+{
+  const bool req_one = flags & NBD_CMD_FLAG_REQ_ONE;
+  const size_t nr_extents = nbdkit_extents_count (extents);
+  size_t i;
+  struct block_descriptor *blocks;
+
+  /* This is checked in server/plugins.c. */
+  assert (nr_extents >= 1);
+
+  /* We may send fewer than nr_extents blocks, but never more. */
+  blocks = calloc (req_one ? 1 : nr_extents, sizeof (struct block_descriptor));
+  if (blocks == NULL) {
+    nbdkit_error ("calloc: %m");
+    return NULL;
+  }
+
+  if (req_one) {
+    const struct nbdkit_extent e = nbdkit_get_extent (extents, 0);
+
+    /* Checked as a side effect of how the extent list is created. */
+    assert (e.length > 0);
+    *nr_blocks = 1;
+
+    /* Must not exceed count of the original request. */
+    blocks[0].length = MIN (e.length, (uint64_t) count);
+    blocks[0].status_flags = e.type & 3;
+  }
+  else {
+    uint64_t pos = offset;
+
+    *nr_blocks = 0;
+    for (i = 0; i < nr_extents; ++i) {
+      const struct nbdkit_extent e = nbdkit_get_extent (extents, i);
+      uint64_t length;
+
+      if (i == 0)
+        assert (e.offset == offset);
+
+      /* Must not exceed UINT32_MAX. */
+      blocks[i].length = length = MIN (e.length, UINT32_MAX);
+      blocks[i].status_flags = e.type & 3;
+      (*nr_blocks)++;           /* counted here so the break keeps it */
+
+      pos += length;
+      if (pos > offset + count) /* this must be the last block */
+        break;
+
+      /* If we reach here then we must have consumed this whole
+       * extent.  This is currently true because the server only sends
+       * 32 bit requests, but if we move to 64 bit requests we will
+       * need to revisit this code so it can split extents into
+       * multiple blocks.  XXX
+       */
+      assert (e.length <= length);
+    }
+  }
+
+#if 0
+  for (i = 0; i < *nr_blocks; ++i)
+    nbdkit_debug ("block status: sending block %" PRIu32 " type %" PRIu32,
+                  blocks[i].length, blocks[i].status_flags);
+#endif
+
+  /* Convert to big endian for the protocol. */
+  for (i = 0; i < *nr_blocks; ++i) {
+    blocks[i].length = htobe32 (blocks[i].length);
+    blocks[i].status_flags = htobe32 (blocks[i].status_flags);
+  }
+
+  return blocks;
+}
+
+/* Send a NBD_REPLY_TYPE_BLOCK_STATUS structured reply describing
+ * 'extents' for the base:allocation context.  Returns 1 if the reply
+ * was sent, otherwise the result of connection_set_status (conn, -1).
+ */
+static int
+send_structured_reply_block_status (struct connection *conn,
+                                    uint64_t handle,
+                                    uint16_t cmd, uint16_t flags,
+                                    uint32_t count, uint64_t offset,
+                                    struct nbdkit_extents *extents)
+{
+  ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&conn->write_lock);
+  CLEANUP_FREE struct block_descriptor *descs = NULL;
+  struct structured_reply hdr;
+  uint32_t ctx_id_be;
+  size_t nr_descs;
+  size_t k;
+
+  assert (conn->meta_context_base_allocation);
+  assert (cmd == NBD_CMD_BLOCK_STATUS);
+
+  /* Build the descriptor array, already in protocol byte order. */
+  descs = extents_to_block_descriptors (extents, flags, count, offset,
+                                        &nr_descs);
+  if (descs == NULL)
+    return connection_set_status (conn, -1);
+
+  /* The payload is the 32 bit context ID followed by the descriptors.
+   * The handle is copied back exactly as given, with no byte swap.
+   */
+  hdr.magic = htobe32 (NBD_STRUCTURED_REPLY_MAGIC);
+  hdr.handle = handle;
+  hdr.flags = htobe16 (NBD_REPLY_FLAG_DONE);
+  hdr.type = htobe16 (NBD_REPLY_TYPE_BLOCK_STATUS);
+  hdr.length = htobe32 (sizeof ctx_id_be + nr_descs * sizeof descs[0]);
+  ctx_id_be = htobe32 (base_allocation_id);
+
+  /* Reply header first, then the base:allocation context ID. */
+  if (conn->send (conn, &hdr, sizeof hdr) == -1 ||
+      conn->send (conn, &ctx_id_be, sizeof ctx_id_be) == -1)
+    goto write_failed;
+
+  /* Then the block descriptors, one write per descriptor. */
+  for (k = 0; k < nr_descs; ++k) {
+    if (conn->send (conn, &descs[k], sizeof descs[k]) == -1)
+      goto write_failed;
+  }
+
+  return 1;                     /* command processed ok */
+
+ write_failed:
+  /* Every failed write is reported and handled the same way. */
+  nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd));
+  return connection_set_status (conn, -1);
+}
+
 static int
 send_structured_reply_error (struct connection *conn,
                              uint64_t handle, uint16_t cmd, uint32_t error)
@@ -402,6 +599,7 @@ protocol_recv_request_send_reply (struct connection *conn)
   uint32_t magic, count, error = 0;
   uint64_t offset;
   CLEANUP_FREE char *buf = NULL;
+  CLEANUP_EXTENTS_FREE struct nbdkit_extents *extents = NULL;
 
   /* Read the request packet. */
   {
@@ -449,6 +647,7 @@ protocol_recv_request_send_reply (struct connection *conn)
     if (cmd == NBD_CMD_READ || cmd == NBD_CMD_WRITE) {
       buf = malloc (count);
       if (buf == NULL) {
+      out_of_memory:
         perror ("malloc");
         error = ENOMEM;
         if (cmd == NBD_CMD_WRITE &&
@@ -458,6 +657,13 @@ protocol_recv_request_send_reply (struct connection *conn)
       }
     }
 
+    /* Allocate the extents list for block status only. */
+    if (cmd == NBD_CMD_BLOCK_STATUS) {
+      extents = nbdkit_extents_new (offset, conn->exportsize);
+      if (extents == NULL)
+        goto out_of_memory;
+    }
+
     /* Receive the write data buffer. */
     if (cmd == NBD_CMD_WRITE) {
       r = conn->recv (conn, buf, count);
@@ -478,7 +684,7 @@ protocol_recv_request_send_reply (struct connection *conn)
   }
   else {
     lock_request (conn);
-    error = handle_request (conn, cmd, flags, offset, count, buf);
+    error = handle_request (conn, cmd, flags, offset, count, buf, extents);
     assert ((int) error >= 0);
     unlock_request (conn);
   }
@@ -498,15 +704,23 @@ protocol_recv_request_send_reply (struct connection *conn)
   }
 
   /* Currently we prefer to send simple replies for everything except
-   * where we have to (ie. NBD_CMD_READ when structured_replies have
-   * been negotiated).  However this prevents us from sending
-   * human-readable error messages to the client, so we should
-   * reconsider this in future.
+   * where we have to (ie. NBD_CMD_READ and NBD_CMD_BLOCK_STATUS when
+   * structured_replies have been negotiated).  However this prevents
+   * us from sending human-readable error messages to the client, so
+   * we should reconsider this in future.
    */
-  if (conn->structured_replies && cmd == NBD_CMD_READ) {
-    if (!error)
-      return send_structured_reply_read (conn, request.handle, cmd,
-                                         buf, count, offset);
+  if (conn->structured_replies &&
+      (cmd == NBD_CMD_READ || cmd == NBD_CMD_BLOCK_STATUS)) {
+    if (!error) {
+      if (cmd == NBD_CMD_READ)
+        return send_structured_reply_read (conn, request.handle, cmd,
+                                           buf, count, offset);
+      else /* NBD_CMD_BLOCK_STATUS */
+        return send_structured_reply_block_status (conn, request.handle,
+                                                   cmd, flags,
+                                                   count, offset,
+                                                   extents);
+    }
     else
       return send_structured_reply_error (conn, request.handle, cmd, error);
   }
diff --git a/TODO b/TODO
index 7cbc238..c968c2d 100644
--- a/TODO
+++ b/TODO
@@ -24,8 +24,8 @@ General ideas for improvements
   to inform nbdkit when the response is ready:
   https://www.redhat.com/archives/libguestfs/2018-January/msg00149.html
 
-* More NBD protocol features. In the upstream pipeline: proposals for
-  block status and online resize.
+* More NBD protocol features.  The only currently missing feature is
+  online resize.
 
 * Add a callback to let plugins request minimum alignment for the
   buffer to pread/pwrite; useful for a plugin utilizing O_DIRECT or
-- 
2.20.1