[Libguestfs] [PATCH v3 14/14] nbd/server: Add FLAG_PAYLOAD support to CMD_BLOCK_STATUS
Vladimir Sementsov-Ogievskiy
vsementsov at yandex-team.ru
Fri Jun 2 09:13:25 UTC 2023
On 15.05.23 22:53, Eric Blake wrote:
> Allow a client to request a subset of negotiated meta contexts. For
> example, a client may ask to use a single connection to learn about
> both block status and dirty bitmaps, but where the dirty bitmap
> queries only need to be performed on a subset of the disk; forcing the
> server to compute that information on block status queries in the rest
> of the disk is wasted effort (both at the server, and on the amount of
> traffic sent over the wire to be parsed and ignored by the client).
>
> Qemu as an NBD client never requests to use more than one meta
> context, so it has no need to use block status payloads. Testing this
> instead requires support from libnbd, which CAN access multiple meta
> contexts in parallel from a single NBD connection; an interop test
> submitted to the libnbd project at the same time as this patch
> demonstrates the feature working, as well as testing some corner cases
> (for example, when the payload length is longer than the export
> length), although other corner cases (like passing the same id
> duplicated) requires a protocol fuzzer because libnbd is not wired up
> to break the protocol that badly.
>
> This also includes tweaks to 'qemu-nbd --list' to show when a server
> is advertising the capability, and to the testsuite to reflect the
> addition to that output.
>
> Signed-off-by: Eric Blake <eblake at redhat.com>
> ---
> docs/interop/nbd.txt | 2 +-
> include/block/nbd.h | 32 ++++--
> nbd/server.c | 106 +++++++++++++++++-
> qemu-nbd.c | 1 +
> nbd/trace-events | 1 +
> tests/qemu-iotests/223.out | 12 +-
> tests/qemu-iotests/307.out | 10 +-
> .../tests/nbd-qemu-allocation.out | 2 +-
> 8 files changed, 136 insertions(+), 30 deletions(-)
>
> diff --git a/docs/interop/nbd.txt b/docs/interop/nbd.txt
> index abaf4c28a96..83d85ce8d13 100644
> --- a/docs/interop/nbd.txt
> +++ b/docs/interop/nbd.txt
> @@ -69,4 +69,4 @@ NBD_CMD_BLOCK_STATUS for "qemu:dirty-bitmap:", NBD_CMD_CACHE
> NBD_CMD_FLAG_FAST_ZERO
> * 5.2: NBD_CMD_BLOCK_STATUS for "qemu:allocation-depth"
> * 7.1: NBD_FLAG_CAN_MULTI_CONN for shareable writable exports
> -* 8.1: NBD_OPT_EXTENDED_HEADERS
> +* 8.1: NBD_OPT_EXTENDED_HEADERS, NBD_FLAG_BLOCK_STATUS_PAYLOAD
> diff --git a/include/block/nbd.h b/include/block/nbd.h
> index 6696d61bd59..3d8d7150121 100644
> --- a/include/block/nbd.h
> +++ b/include/block/nbd.h
> @@ -175,6 +175,12 @@ typedef struct NBDExtentExt {
> uint64_t flags; /* NBD_STATE_* */
> } QEMU_PACKED NBDExtentExt;
>
> +/* Client payload for limiting NBD_CMD_BLOCK_STATUS reply */
> +typedef struct NBDBlockStatusPayload {
> + uint64_t effect_length;
> + /* uint32_t ids[] follows, array length implied by header */
> +} QEMU_PACKED NBDBlockStatusPayload;
> +
> /* Transmission (export) flags: sent from server to client during handshake,
> but describe what will happen during transmission */
> enum {
> @@ -191,20 +197,22 @@ enum {
> NBD_FLAG_SEND_RESIZE_BIT = 9, /* Send resize */
> NBD_FLAG_SEND_CACHE_BIT = 10, /* Send CACHE (prefetch) */
> NBD_FLAG_SEND_FAST_ZERO_BIT = 11, /* FAST_ZERO flag for WRITE_ZEROES */
> + NBD_FLAG_BLOCK_STAT_PAYLOAD_BIT = 12, /* PAYLOAD flag for BLOCK_STATUS */
> };
>
> -#define NBD_FLAG_HAS_FLAGS (1 << NBD_FLAG_HAS_FLAGS_BIT)
> -#define NBD_FLAG_READ_ONLY (1 << NBD_FLAG_READ_ONLY_BIT)
> -#define NBD_FLAG_SEND_FLUSH (1 << NBD_FLAG_SEND_FLUSH_BIT)
> -#define NBD_FLAG_SEND_FUA (1 << NBD_FLAG_SEND_FUA_BIT)
> -#define NBD_FLAG_ROTATIONAL (1 << NBD_FLAG_ROTATIONAL_BIT)
> -#define NBD_FLAG_SEND_TRIM (1 << NBD_FLAG_SEND_TRIM_BIT)
> -#define NBD_FLAG_SEND_WRITE_ZEROES (1 << NBD_FLAG_SEND_WRITE_ZEROES_BIT)
> -#define NBD_FLAG_SEND_DF (1 << NBD_FLAG_SEND_DF_BIT)
> -#define NBD_FLAG_CAN_MULTI_CONN (1 << NBD_FLAG_CAN_MULTI_CONN_BIT)
> -#define NBD_FLAG_SEND_RESIZE (1 << NBD_FLAG_SEND_RESIZE_BIT)
> -#define NBD_FLAG_SEND_CACHE (1 << NBD_FLAG_SEND_CACHE_BIT)
> -#define NBD_FLAG_SEND_FAST_ZERO (1 << NBD_FLAG_SEND_FAST_ZERO_BIT)
> +#define NBD_FLAG_HAS_FLAGS (1 << NBD_FLAG_HAS_FLAGS_BIT)
> +#define NBD_FLAG_READ_ONLY (1 << NBD_FLAG_READ_ONLY_BIT)
> +#define NBD_FLAG_SEND_FLUSH (1 << NBD_FLAG_SEND_FLUSH_BIT)
> +#define NBD_FLAG_SEND_FUA (1 << NBD_FLAG_SEND_FUA_BIT)
> +#define NBD_FLAG_ROTATIONAL (1 << NBD_FLAG_ROTATIONAL_BIT)
> +#define NBD_FLAG_SEND_TRIM (1 << NBD_FLAG_SEND_TRIM_BIT)
> +#define NBD_FLAG_SEND_WRITE_ZEROES (1 << NBD_FLAG_SEND_WRITE_ZEROES_BIT)
> +#define NBD_FLAG_SEND_DF (1 << NBD_FLAG_SEND_DF_BIT)
> +#define NBD_FLAG_CAN_MULTI_CONN (1 << NBD_FLAG_CAN_MULTI_CONN_BIT)
> +#define NBD_FLAG_SEND_RESIZE (1 << NBD_FLAG_SEND_RESIZE_BIT)
> +#define NBD_FLAG_SEND_CACHE (1 << NBD_FLAG_SEND_CACHE_BIT)
> +#define NBD_FLAG_SEND_FAST_ZERO (1 << NBD_FLAG_SEND_FAST_ZERO_BIT)
> +#define NBD_FLAG_BLOCK_STAT_PAYLOAD (1 << NBD_FLAG_BLOCK_STAT_PAYLOAD_BIT)
>
> /* New-style handshake (global) flags, sent from server to client, and
> control what will happen during handshake phase. */
> diff --git a/nbd/server.c b/nbd/server.c
> index db550c82cd2..ce11285c0d7 100644
> --- a/nbd/server.c
> +++ b/nbd/server.c
> @@ -442,9 +442,9 @@ static int nbd_negotiate_handle_list(NBDClient *client, Error **errp)
> return nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
> }
>
> -static void nbd_check_meta_export(NBDClient *client)
> +static void nbd_check_meta_export(NBDClient *client, NBDExport *exp)
> {
> - if (client->exp != client->context_exp) {
> + if (exp != client->context_exp) {
> client->contexts.count = 0;
> }
> }
> @@ -491,11 +491,15 @@ static int nbd_negotiate_handle_export_name(NBDClient *client, bool no_zeroes,
> error_setg(errp, "export not found");
> return -EINVAL;
> }
> + nbd_check_meta_export(client, client->exp);
>
> myflags = client->exp->nbdflags;
> if (client->header_style >= NBD_HEADER_STRUCTURED) {
> myflags |= NBD_FLAG_SEND_DF;
> }
> + if (client->extended_headers && client->contexts.count) {
> + myflags |= NBD_FLAG_BLOCK_STAT_PAYLOAD;
> + }
> trace_nbd_negotiate_new_style_size_flags(client->exp->size, myflags);
> stq_be_p(buf, client->exp->size);
> stw_be_p(buf + 8, myflags);
> @@ -508,7 +512,6 @@ static int nbd_negotiate_handle_export_name(NBDClient *client, bool no_zeroes,
>
> QTAILQ_INSERT_TAIL(&client->exp->clients, client, next);
> blk_exp_ref(&client->exp->common);
> - nbd_check_meta_export(client);
>
> return 0;
> }
> @@ -628,6 +631,9 @@ static int nbd_negotiate_handle_info(NBDClient *client, Error **errp)
> errp, "export '%s' not present",
> sane_name);
> }
> + if (client->opt == NBD_OPT_GO) {
> + nbd_check_meta_export(client, exp);
> + }
>
> /* Don't bother sending NBD_INFO_NAME unless client requested it */
> if (sendname) {
> @@ -681,6 +687,10 @@ static int nbd_negotiate_handle_info(NBDClient *client, Error **errp)
> if (client->header_style >= NBD_HEADER_STRUCTURED) {
> myflags |= NBD_FLAG_SEND_DF;
> }
> + if (client->extended_headers &&
> + (client->contexts.count || client->opt == NBD_OPT_INFO)) {
> + myflags |= NBD_FLAG_BLOCK_STAT_PAYLOAD;
> + }
> trace_nbd_negotiate_new_style_size_flags(exp->size, myflags);
> stq_be_p(buf, exp->size);
> stw_be_p(buf + 8, myflags);
> @@ -716,7 +726,6 @@ static int nbd_negotiate_handle_info(NBDClient *client, Error **errp)
> client->check_align = check_align;
> QTAILQ_INSERT_TAIL(&client->exp->clients, client, next);
> blk_exp_ref(&client->exp->common);
> - nbd_check_meta_export(client);
> rc = 1;
> }
> return rc;
> @@ -2415,6 +2424,83 @@ static int coroutine_fn nbd_co_send_bitmap(NBDClient *client,
> return nbd_co_send_extents(client, request, ea, last, context_id, errp);
> }
>
> +/*
> + * nbd_co_block_status_payload_read
> + * Called when a client wants a subset of negotiated contexts via a
> + * BLOCK_STATUS payload. Check the payload for valid length and
> + * contents. On success, return 0 with request updated to effective
> + * length. If request was invalid but payload consumed, return 0 with
> + * request->len and request->contexts.count set to 0 (which will
> + * trigger an appropriate NBD_EINVAL response later on).
Hmm. So, it leads to
case NBD_CMD_BLOCK_STATUS:
if (!request->len) {
return nbd_send_generic_reply(client, request, -EINVAL,
"need non-zero length", errp);
}
EINVAL is OK, but "need non-zero length" message is not appropriate.. I think we need separate reply for the case of invalid payload.
> On I/O
> + * error, return -EIO.
> + */
> +static int
> +nbd_co_block_status_payload_read(NBDClient *client, NBDRequest *request,
> + Error **errp)
> +{
> + int payload_len = request->len;
> + g_autofree char *buf = NULL;
> + g_autofree bool *bitmaps = NULL;
> + size_t count, i;
> + uint32_t id;
> +
> + assert(request->len <= NBD_MAX_BUFFER_SIZE);
> + if (payload_len % sizeof(uint32_t) ||
> + payload_len < sizeof(NBDBlockStatusPayload) ||
> + payload_len > (sizeof(NBDBlockStatusPayload) +
> + sizeof(id) * client->contexts.count)) {
> + goto skip;
> + }
> +
> + buf = g_malloc(payload_len);
> + if (nbd_read(client->ioc, buf, payload_len,
> + "CMD_BLOCK_STATUS data", errp) < 0) {
> + return -EIO;
> + }
> + trace_nbd_co_receive_request_payload_received(request->handle,
> + payload_len);
> + memset(&request->contexts, 0, sizeof(request->contexts));
> + request->contexts.nr_bitmaps = client->context_exp->nr_export_bitmaps;
> + bitmaps = g_new0(bool, request->contexts.nr_bitmaps);
> + count = (payload_len - sizeof(NBDBlockStatusPayload)) / sizeof(id);
In doc we have MUST: "The payload form MUST occupy 8 + n*4 bytes", do we really want to forgive and ignore unaligned tail? May be better to "goto skip" in this case, to avoid ambiguity.
> + payload_len = 0;
> +
> + for (i = 0; i < count; i++) {
> +
> + id = ldl_be_p(buf + sizeof(NBDBlockStatusPayload) + sizeof(id) * i);
> + if (id == NBD_META_ID_BASE_ALLOCATION) {
> + if (request->contexts.base_allocation) {
> + goto skip;
> + }
> + request->contexts.base_allocation = true;
> + } else if (id == NBD_META_ID_ALLOCATION_DEPTH) {
> + if (request->contexts.allocation_depth) {
> + goto skip;
> + }
> + request->contexts.allocation_depth = true;
> + } else {
> + if (id - NBD_META_ID_DIRTY_BITMAP >
> + request->contexts.nr_bitmaps ||
> + bitmaps[id - NBD_META_ID_DIRTY_BITMAP]) {
> + goto skip;
> + }
> + bitmaps[id - NBD_META_ID_DIRTY_BITMAP] = true;
> + }
> + }
> +
> + request->len = ldq_be_p(buf);
> + request->contexts.count = count;
> + request->contexts.bitmaps = bitmaps;
> + bitmaps = NULL;
better I think:
request->context.bitmaps = g_steal_pointer(bitmaps);
- as a pair to g_autofree.
> + return 0;
> +
> + skip:
> + trace_nbd_co_receive_block_status_payload_compliance(request->from,
> + request->len);
> + request->len = request->contexts.count = 0;
> + return nbd_drop(client->ioc, payload_len, errp);
> +}
> +
> /* nbd_co_receive_request
> * Collect a client request. Return 0 if request looks valid, -EIO to drop
> * connection right away, -EAGAIN to indicate we were interrupted and the
> @@ -2461,7 +2547,14 @@ static int coroutine_fn nbd_co_receive_request(NBDRequestData *req, NBDRequest *
>
> if (request->type == NBD_CMD_WRITE || extended_with_payload) {
> payload_len = request->len;
> - if (request->type != NBD_CMD_WRITE) {
> + if (request->type == NBD_CMD_BLOCK_STATUS) {
> + payload_len = nbd_co_block_status_payload_read(client,
> + request,
> + errp);
> + if (payload_len < 0) {
> + return -EIO;
> + }
Seems we can handle all payload in one "switch" block, instead of handling BLOCK_STATUS here and postponing WRITE payload (and dropping) up to the end of the block with help of payload_len variable.
> + } else if (request->type != NBD_CMD_WRITE) {
> /*
> * For now, we don't support payloads on other
> * commands; but we can keep the connection alive.
> @@ -2540,6 +2633,9 @@ static int coroutine_fn nbd_co_receive_request(NBDRequestData *req, NBDRequest *
> valid_flags |= NBD_CMD_FLAG_NO_HOLE | NBD_CMD_FLAG_FAST_ZERO;
> } else if (request->type == NBD_CMD_BLOCK_STATUS) {
> valid_flags |= NBD_CMD_FLAG_REQ_ONE;
> + if (client->extended_headers && client->contexts.count) {
> + valid_flags |= NBD_CMD_FLAG_PAYLOAD_LEN;
> + }
> }
> if (request->flags & ~valid_flags) {
> error_setg(errp, "unsupported flags for command %s (got 0x%x)",
> diff --git a/qemu-nbd.c b/qemu-nbd.c
> index 8c35442626a..b7ab0fdc791 100644
> --- a/qemu-nbd.c
> +++ b/qemu-nbd.c
> @@ -222,6 +222,7 @@ static int qemu_nbd_client_list(SocketAddress *saddr, QCryptoTLSCreds *tls,
> [NBD_FLAG_SEND_RESIZE_BIT] = "resize",
> [NBD_FLAG_SEND_CACHE_BIT] = "cache",
> [NBD_FLAG_SEND_FAST_ZERO_BIT] = "fast-zero",
> + [NBD_FLAG_BLOCK_STAT_PAYLOAD_BIT] = "block-status-payload",
> };
>
> printf(" size: %" PRIu64 "\n", list[i].size);
> diff --git a/nbd/trace-events b/nbd/trace-events
> index c20df33a431..da92fe1b56b 100644
> --- a/nbd/trace-events
> +++ b/nbd/trace-events
> @@ -70,6 +70,7 @@ nbd_co_send_structured_read(uint64_t handle, uint64_t offset, void *data, size_t
> nbd_co_send_structured_read_hole(uint64_t handle, uint64_t offset, size_t size) "Send structured read hole reply: handle = %" PRIu64 ", offset = %" PRIu64 ", len = %zu"
> nbd_co_send_extents(uint64_t handle, unsigned int extents, uint32_t id, uint64_t length, int last) "Send block status reply: handle = %" PRIu64 ", extents = %u, context = %d (extents cover %" PRIu64 " bytes, last chunk = %d)"
> nbd_co_send_structured_error(uint64_t handle, int err, const char *errname, const char *msg) "Send structured error reply: handle = %" PRIu64 ", error = %d (%s), msg = '%s'"
> +nbd_co_receive_block_status_payload_compliance(uint64_t from, int len) "client sent unusable block status payload: from=0x%" PRIx64 ", len=0x%x"
> nbd_co_receive_request_decode_type(uint64_t handle, uint16_t type, const char *name) "Decoding type: handle = %" PRIu64 ", type = %" PRIu16 " (%s)"
> nbd_co_receive_request_payload_received(uint64_t handle, uint64_t len) "Payload received: handle = %" PRIu64 ", len = %" PRIu64
> nbd_co_receive_ext_payload_compliance(uint64_t from, uint64_t len) "client sent non-compliant write without payload flag: from=0x%" PRIx64 ", len=0x%" PRIx64
> diff --git a/tests/qemu-iotests/223.out b/tests/qemu-iotests/223.out
> index b98582c38ea..b38f0b7963b 100644
> --- a/tests/qemu-iotests/223.out
> +++ b/tests/qemu-iotests/223.out
> @@ -83,7 +83,7 @@ exports available: 0
> exports available: 3
> export: 'n'
> size: 4194304
> - flags: 0x58f ( readonly flush fua df multi cache )
> + flags: 0x158f ( readonly flush fua df multi cache block-status-payload )
> min block: 1
> opt block: 4096
> max block: 33554432
> @@ -94,7 +94,7 @@ exports available: 3
> export: 'n2'
> description: some text
> size: 4194304
> - flags: 0xded ( flush fua trim zeroes df multi cache fast-zero )
> + flags: 0x1ded ( flush fua trim zeroes df multi cache fast-zero block-status-payload )
> min block: 1
> opt block: 4096
> max block: 33554432
> @@ -104,7 +104,7 @@ exports available: 3
> qemu:dirty-bitmap:b2
> export: 'n3'
> size: 4194304
> - flags: 0x58f ( readonly flush fua df multi cache )
> + flags: 0x158f ( readonly flush fua df multi cache block-status-payload )
> min block: 1
> opt block: 4096
> max block: 33554432
> @@ -205,7 +205,7 @@ exports available: 0
> exports available: 3
> export: 'n'
> size: 4194304
> - flags: 0x58f ( readonly flush fua df multi cache )
> + flags: 0x158f ( readonly flush fua df multi cache block-status-payload )
> min block: 1
> opt block: 4096
> max block: 33554432
> @@ -216,7 +216,7 @@ exports available: 3
> export: 'n2'
> description: some text
> size: 4194304
> - flags: 0xded ( flush fua trim zeroes df multi cache fast-zero )
> + flags: 0x1ded ( flush fua trim zeroes df multi cache fast-zero block-status-payload )
> min block: 1
> opt block: 4096
> max block: 33554432
> @@ -226,7 +226,7 @@ exports available: 3
> qemu:dirty-bitmap:b2
> export: 'n3'
> size: 4194304
> - flags: 0x58f ( readonly flush fua df multi cache )
> + flags: 0x158f ( readonly flush fua df multi cache block-status-payload )
> min block: 1
> opt block: 4096
> max block: 33554432
> diff --git a/tests/qemu-iotests/307.out b/tests/qemu-iotests/307.out
> index 2b9a6a67a1a..f645f3315f8 100644
> --- a/tests/qemu-iotests/307.out
> +++ b/tests/qemu-iotests/307.out
> @@ -15,7 +15,7 @@ wrote 4096/4096 bytes at offset 0
> exports available: 1
> export: 'fmt'
> size: 67108864
> - flags: 0x58f ( readonly flush fua df multi cache )
> + flags: 0x158f ( readonly flush fua df multi cache block-status-payload )
> min block: XXX
> opt block: XXX
> max block: XXX
> @@ -44,7 +44,7 @@ exports available: 1
> exports available: 1
> export: 'fmt'
> size: 67108864
> - flags: 0x58f ( readonly flush fua df multi cache )
> + flags: 0x158f ( readonly flush fua df multi cache block-status-payload )
> min block: XXX
> opt block: XXX
> max block: XXX
> @@ -76,7 +76,7 @@ exports available: 1
> exports available: 2
> export: 'fmt'
> size: 67108864
> - flags: 0x58f ( readonly flush fua df multi cache )
> + flags: 0x158f ( readonly flush fua df multi cache block-status-payload )
> min block: XXX
> opt block: XXX
> max block: XXX
> @@ -86,7 +86,7 @@ exports available: 2
> export: 'export1'
> description: This is the writable second export
> size: 67108864
> - flags: 0xded ( flush fua trim zeroes df multi cache fast-zero )
> + flags: 0x1ded ( flush fua trim zeroes df multi cache fast-zero block-status-payload )
> min block: XXX
> opt block: XXX
> max block: XXX
> @@ -113,7 +113,7 @@ exports available: 1
> export: 'export1'
> description: This is the writable second export
> size: 67108864
> - flags: 0xded ( flush fua trim zeroes df multi cache fast-zero )
> + flags: 0x1ded ( flush fua trim zeroes df multi cache fast-zero block-status-payload )
> min block: XXX
> opt block: XXX
> max block: XXX
> diff --git a/tests/qemu-iotests/tests/nbd-qemu-allocation.out b/tests/qemu-iotests/tests/nbd-qemu-allocation.out
> index 659276032b0..794d1bfce62 100644
> --- a/tests/qemu-iotests/tests/nbd-qemu-allocation.out
> +++ b/tests/qemu-iotests/tests/nbd-qemu-allocation.out
> @@ -17,7 +17,7 @@ wrote 2097152/2097152 bytes at offset 1048576
> exports available: 1
> export: ''
> size: 4194304
> - flags: 0x48f ( readonly flush fua df cache )
> + flags: 0x148f ( readonly flush fua df cache block-status-payload )
> min block: 1
> opt block: 4096
> max block: 33554432
--
Best regards,
Vladimir
More information about the Libguestfs
mailing list