[dm-devel] [PATCH v3] crypto: xts - Drop use of auxiliary buffer
Eric Biggers
ebiggers at kernel.org
Sat Sep 8 01:35:06 UTC 2018
Hi Ondrej,
On Wed, Sep 05, 2018 at 01:30:39PM +0200, Ondrej Mosnacek wrote:
> Since commit acb9b159c784 ("crypto: gf128mul - define gf128mul_x_* in
> gf128mul.h"), the gf128mul_x_*() functions are very fast and therefore
> caching the computed XTS tweaks has only negligible advantage over
> computing them twice.
>
> In fact, since the current caching implementation limits the size of
> the calls to the child ecb(...) algorithm to PAGE_SIZE (usually 4096 B),
> it is often actually slower than the simple recomputing implementation.
>
> This patch simplifies the XTS template to recompute the XTS tweaks from
> scratch in the second pass and thus also removes the need to allocate a
> dynamic buffer using kmalloc().
>
> As discussed at [1], the use of kmalloc causes deadlocks with dm-crypt.
>
> PERFORMANCE RESULTS
> I measured time to encrypt/decrypt a memory buffer of varying sizes with
> xts(ecb-aes-aesni) using a tool I wrote ([2]) and the results suggest
> that after this patch the performance is either better or comparable for
> both small and large buffers. Note that there is a lot of noise in the
> measurements, but the overall difference is easy to see.
>
> Old code:
> ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
>  xts(aes)      256        64            331            328
>  xts(aes)      384        64            332            333
>  xts(aes)      512        64            338            348
>  xts(aes)      256       512            889            920
>  xts(aes)      384       512           1019            993
>  xts(aes)      512       512           1032            990
>  xts(aes)      256      4096           2152           2292
>  xts(aes)      384      4096           2453           2597
>  xts(aes)      512      4096           3041           2641
>  xts(aes)      256     16384           9443           8027
>  xts(aes)      384     16384           8536           8925
>  xts(aes)      512     16384           9232           9417
>  xts(aes)      256     32768          16383          14897
>  xts(aes)      384     32768          17527          16102
>  xts(aes)      512     32768          18483          17322
>
> New code:
> ALGORITHM  KEY (b)  DATA (B)  TIME ENC (ns)  TIME DEC (ns)
>  xts(aes)      256        64            328            324
>  xts(aes)      384        64            324            319
>  xts(aes)      512        64            320            322
>  xts(aes)      256       512            476            473
>  xts(aes)      384       512            509            492
>  xts(aes)      512       512            531            514
>  xts(aes)      256      4096           2132           1829
>  xts(aes)      384      4096           2357           2055
>  xts(aes)      512      4096           2178           2027
>  xts(aes)      256     16384           6920           6983
>  xts(aes)      384     16384           8597           7505
>  xts(aes)      512     16384           7841           8164
>  xts(aes)      256     32768          13468          12307
>  xts(aes)      384     32768          14808          13402
>  xts(aes)      512     32768          15753          14636
>
> [1] https://lkml.org/lkml/2018/8/23/1315
> [2] https://gitlab.com/omos/linux-crypto-bench
>
> Signed-off-by: Ondrej Mosnacek <omosnace at redhat.com>
> ---
> crypto/xts.c | 265 ++++++++-------------------------------------------
> 1 file changed, 39 insertions(+), 226 deletions(-)
>
> Changes in v3:
> - add comment explaining the new approach as suggested by Eric
> - ensure correct alignment in second xor_tweak() pass
> - align performance results table header to the right
>
> v2: https://www.spinics.net/lists/linux-crypto/msg34799.html
> Changes in v2:
> - rebase to latest cryptodev tree
>
> v1: https://www.spinics.net/lists/linux-crypto/msg34776.html
>
> diff --git a/crypto/xts.c b/crypto/xts.c
> index ccf55fbb8bc2..24cfecdec565 100644
> --- a/crypto/xts.c
> +++ b/crypto/xts.c
> @@ -26,8 +26,6 @@
> #include <crypto/b128ops.h>
> #include <crypto/gf128mul.h>
>
> -#define XTS_BUFFER_SIZE 128u
> -
> struct priv {
> struct crypto_skcipher *child;
> struct crypto_cipher *tweak;
> @@ -39,19 +37,7 @@ struct xts_instance_ctx {
> };
>
> struct rctx {
> - le128 buf[XTS_BUFFER_SIZE / sizeof(le128)];
> -
> le128 t;
> -
> - le128 *ext;
> -
> - struct scatterlist srcbuf[2];
> - struct scatterlist dstbuf[2];
> - struct scatterlist *src;
> - struct scatterlist *dst;
> -
> - unsigned int left;
> -
> struct skcipher_request subreq;
> };
>
> @@ -96,265 +82,92 @@ static int setkey(struct crypto_skcipher *parent, const u8 *key,
> return err;
> }
>
> -static int post_crypt(struct skcipher_request *req)
> +/*
> + * We compute the tweak masks twice (both before and after the ECB encryption or
> + * decryption) to avoid having to allocate a temporary buffer and/or make
> + * multiple calls to the 'ecb(..)' instance, which usually would be slower than
> + * just doing the gf128mul_x_ble() calls again.
> + */
> +static int xor_tweak(struct skcipher_request *req, struct skcipher_request *subreq)
> {
> struct rctx *rctx = skcipher_request_ctx(req);
> - le128 *buf = rctx->ext ?: rctx->buf;
> - struct skcipher_request *subreq;
> + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
> const int bs = XTS_BLOCK_SIZE;
> struct skcipher_walk w;
> - struct scatterlist *sg;
> - unsigned offset;
> + le128 t = rctx->t;
> int err;
>
> - subreq = &rctx->subreq;
> + /* set to our TFM to enforce correct alignment: */
> + skcipher_request_set_tfm(subreq, tfm);
> +
> err = skcipher_walk_virt(&w, subreq, false);
>
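(Aside, for readers less familiar with the XTS template: the walk loop shared by both passes is elided from the hunk above. Conceptually, it XORs each 16-byte block with the current tweak and then advances the tweak with gf128mul_x_ble(). Below is a simplified sketch over a flat buffer rather than a skcipher_walk, using only the helpers from <crypto/b128ops.h> and <crypto/gf128mul.h>; it is illustrative, not the actual kernel code.)

#include <crypto/b128ops.h>
#include <crypto/gf128mul.h>

/*
 * Apply the XTS tweak sequence that starts at 't' to 'nblocks' contiguous
 * 16-byte blocks.  The sequence is a pure function of 't' (saved in rctx->t),
 * so running this once before and once after the ECB step regenerates exactly
 * the same tweak values, and no cache buffer is needed.
 */
static void xor_tweak_sketch(le128 *blocks, unsigned int nblocks, le128 t)
{
        unsigned int i;

        for (i = 0; i < nblocks; i++) {
                /* blocks[i] ^= t */
                le128_xor(&blocks[i], &t, &blocks[i]);
                /* t = t * x in GF(2^128), little-endian block convention */
                gf128mul_x_ble(&t, &t);
        }
}

Since commit acb9b159c784 made gf128mul_x_ble() cheap, redoing this loop in the second pass is faster than bouncing cached tweaks through a kmalloc'ed buffer and PAGE_SIZE-limited ecb(...) calls, which is the point of the patch.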
Hmm, it confused me that 'subreq' isn't necessarily the same as 'rctx->subreq'.
Also, skcipher_request_set_tfm() is called even on the original 'req'. I suppose
it ends up setting the tfm to the value it already has and is therefore safe, but
I'm not completely sure; do any other algorithms do that? I don't think it's a
good idea in general to modify the request outside of the request_ctx() portion.
Actually, all the information is available from the original 'req' anyway, so why
not just pass a bool that indicates whether it's the first or second XOR pass?
Like the following incremental patch:
diff --git a/crypto/xts.c b/crypto/xts.c
index 24cfecdec5656..0df868aa0ae7f 100644
--- a/crypto/xts.c
+++ b/crypto/xts.c
@@ -88,7 +88,7 @@ static int setkey(struct crypto_skcipher *parent, const u8 *key,
* multiple calls to the 'ecb(..)' instance, which usually would be slower than
* just doing the gf128mul_x_ble() calls again.
*/
-static int xor_tweak(struct skcipher_request *req, struct skcipher_request *subreq)
+static int xor_tweak(struct skcipher_request *req, bool second_pass)
{
struct rctx *rctx = skcipher_request_ctx(req);
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
@@ -97,10 +97,12 @@ static int xor_tweak(struct skcipher_request *req, struct skcipher_request *subr
le128 t = rctx->t;
int err;
- /* set to our TFM to enforce correct alignment: */
- skcipher_request_set_tfm(subreq, tfm);
-
- err = skcipher_walk_virt(&w, subreq, false);
+ if (second_pass) {
+ req = &rctx->subreq;
+ /* set to our TFM to enforce correct alignment: */
+ skcipher_request_set_tfm(req, tfm);
+ }
+ err = skcipher_walk_virt(&w, req, false);
while (w.nbytes) {
unsigned int avail = w.nbytes;
@@ -124,11 +126,9 @@ static int xor_tweak(struct skcipher_request *req, struct skcipher_request *subr
static void crypt_done(struct crypto_async_request *areq, int err)
{
struct skcipher_request *req = areq->data;
- struct rctx *rctx = skcipher_request_ctx(req);
- struct skcipher_request *subreq = &rctx->subreq;
if (!err)
- err = xor_tweak(req, subreq);
+ err = xor_tweak(req, true);
skcipher_request_complete(req, err);
}
@@ -154,9 +154,9 @@ static int encrypt(struct skcipher_request *req)
struct skcipher_request *subreq = &rctx->subreq;
init_crypt(req);
- return xor_tweak(req, req) ?:
+ return xor_tweak(req, false) ?:
crypto_skcipher_encrypt(subreq) ?:
- xor_tweak(req, subreq);
+ xor_tweak(req, true);
}
static int decrypt(struct skcipher_request *req)
@@ -165,9 +165,9 @@ static int decrypt(struct skcipher_request *req)
struct skcipher_request *subreq = &rctx->subreq;
init_crypt(req);
- return xor_tweak(req, req) ?:
+ return xor_tweak(req, false) ?:
crypto_skcipher_decrypt(subreq) ?:
- xor_tweak(req, subreq);
+ xor_tweak(req, true);
}
static int init_tfm(struct crypto_skcipher *tfm)
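(One more aside on why modifying the caller's 'req' feels wrong: the outer request is allocated and owned entirely by the user of the template, and only the request_ctx() area attached to it, where rctx->subreq lives, belongs to the xts template. Here is a minimal caller-side sketch of driving xts(aes) through the skcipher API; the function name and error handling are illustrative only.)

#include <crypto/skcipher.h>
#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/scatterlist.h>

static int xts_encrypt_example(u8 *buf, unsigned int len,
                               const u8 *key, unsigned int keylen, u8 iv[16])
{
        struct crypto_skcipher *tfm;
        struct skcipher_request *req;
        struct scatterlist sg;
        DECLARE_CRYPTO_WAIT(wait);
        int err;

        tfm = crypto_alloc_skcipher("xts(aes)", 0, 0);
        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        err = crypto_skcipher_setkey(tfm, key, keylen);
        if (err)
                goto out_free_tfm;

        /* The caller owns this request; the template only owns its req ctx. */
        req = skcipher_request_alloc(tfm, GFP_KERNEL);
        if (!req) {
                err = -ENOMEM;
                goto out_free_tfm;
        }

        sg_init_one(&sg, buf, len);
        skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP,
                                      crypto_req_done, &wait);
        skcipher_request_set_crypt(req, &sg, &sg, len, iv);

        /* crypto_wait_req() handles a possibly asynchronous implementation. */
        err = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);

        skcipher_request_free(req);
out_free_tfm:
        crypto_free_skcipher(tfm);
        return err;
}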