(cc Eric)

On Thu, 17 Apr 2025 at 08:49, Qingfang Deng <dqfext@xxxxxxxxx> wrote:
>
> From: Qingfang Deng <qingfang.deng@xxxxxxxxxxxxxxx>
>
> Add a scalar implementation of GHASH for RISC-V using the Zbc (carry-less
> multiplication) and Zbb (bit-manipulation) extensions. This implementation
> is adapted from OpenSSL but rewritten in plain C for clarity.
>
> Unlike the OpenSSL one that relies on bit-reflection of the data, this
> version uses a pre-computed (reflected and multiplied) key, inspired by
> the approach used in Intel's CLMUL driver, to avoid reflections during
> runtime.
>
> Signed-off-by: Qingfang Deng <qingfang.deng@xxxxxxxxxxxxxxx>

What is the use case for this? AIUI, the scalar AES instructions were
never implemented by anyone, so how do you expect this to be used in
practice?

> ---
>  arch/riscv/crypto/Kconfig               |  16 +-
>  arch/riscv/crypto/Makefile              |   2 +
>  arch/riscv/crypto/ghash-riscv64-clmul.c | 270 ++++++++++++++++++++++++
>  3 files changed, 287 insertions(+), 1 deletion(-)
>  create mode 100644 arch/riscv/crypto/ghash-riscv64-clmul.c
>
> diff --git a/arch/riscv/crypto/Kconfig b/arch/riscv/crypto/Kconfig
> index 6392e1e11bc9..03b74d4116cb 100644
> --- a/arch/riscv/crypto/Kconfig
> +++ b/arch/riscv/crypto/Kconfig
> @@ -26,7 +26,7 @@ config CRYPTO_CHACHA_RISCV64
>          default CRYPTO_LIB_CHACHA_INTERNAL
>
>  config CRYPTO_GHASH_RISCV64
> -        tristate "Hash functions: GHASH"
> +        tristate "Hash functions: GHASH (vector accelerated)"
>          depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
>          select CRYPTO_GCM
>          help
> @@ -35,6 +35,20 @@ config CRYPTO_GHASH_RISCV64
>            Architecture: riscv64 using:
>            - Zvkg vector crypto extension
>
> +config CRYPTO_GHASH_RISCV64_CLMUL
> +        tristate "Hash functions: GHASH (CLMUL scalar accelerated)"
> +        depends on 64BIT && TOOLCHAIN_HAS_ZBB && TOOLCHAIN_HAS_ZBC
> +        select CRYPTO_GCM
> +        help
> +          GCM GHASH function (NIST SP 800-38D)
> +
> +          Architecture: riscv64 using:
> +          - Zbb Bitmanipulation extension
> +          - Zbc Carry-less multiplication
> +          OR
> +          - Zbkb Bit-manipulation for Cryptography
> +          - Zbkc Carry-less multiplication for Cryptography
> +
>  config CRYPTO_SHA256_RISCV64
>          tristate "Hash functions: SHA-224 and SHA-256"
>          depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
> diff --git a/arch/riscv/crypto/Makefile b/arch/riscv/crypto/Makefile
> index 247c7bc7288c..b5dc497d398c 100644
> --- a/arch/riscv/crypto/Makefile
> +++ b/arch/riscv/crypto/Makefile
> @@ -10,6 +10,8 @@ chacha-riscv64-y := chacha-riscv64-glue.o chacha-riscv64-zvkb.o
>  obj-$(CONFIG_CRYPTO_GHASH_RISCV64) += ghash-riscv64.o
>  ghash-riscv64-y := ghash-riscv64-glue.o ghash-riscv64-zvkg.o
>
> +obj-$(CONFIG_CRYPTO_GHASH_RISCV64_CLMUL) += ghash-riscv64-clmul.o
> +
>  obj-$(CONFIG_CRYPTO_SHA256_RISCV64) += sha256-riscv64.o
>  sha256-riscv64-y := sha256-riscv64-glue.o sha256-riscv64-zvknha_or_zvknhb-zvkb.o
>
> diff --git a/arch/riscv/crypto/ghash-riscv64-clmul.c b/arch/riscv/crypto/ghash-riscv64-clmul.c
> new file mode 100644
> index 000000000000..4777aa8e94cb
> --- /dev/null
> +++ b/arch/riscv/crypto/ghash-riscv64-clmul.c
> @@ -0,0 +1,270 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * GHASH using the RISC-V Zbc/Zbkc (CLMUL) extension
> + *
> + * Copyright (C) 2023 VRULL GmbH
> + * Author: Christoph Müllner <christoph.muellner@xxxxxxxx>
> + *
> + * Copyright (C) 2025 Siflower Communications Ltd
> + * Author: Qingfang Deng <qingfang.deng@xxxxxxxxxxxxxxx>
> + */
> +
> +#include <linux/crypto.h>
> +#include <linux/err.h>
> +#include <linux/module.h>
> +#include <linux/types.h>
> +#include <crypto/ghash.h>
> +#include <crypto/internal/hash.h>
> +
> +#define GHASH_MOD_POLY 0xc200000000000000
> +
> +struct riscv64_clmul_ghash_ctx {
> +        __uint128_t key;
> +};
> +
> +struct riscv64_clmul_ghash_desc_ctx {
> +        __uint128_t shash;
> +        u8 buffer[GHASH_DIGEST_SIZE];
> +        int bytes;
> +};
> +
> +static __always_inline u64 riscv_zbb_swab64(u64 val)
> +{
> +        asm (".option push\n"
> +             ".option arch,+zbb\n"
> +             "rev8 %0, %1\n"
> +             ".option pop\n"
> +             : "=r" (val) : "r" (val));
> +        return val;
> +}
> +
> +static __always_inline __uint128_t get_unaligned_be128(const u8 *p)
> +{
> +        __uint128_t val;
> +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS

CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS means that get_unaligned_xxx()
helpers are cheap. Casting a void* to an aligned type is still UB as
per the C standard. So better to drop the #ifdef entirely, and just
use the get_unaligned_be64() helpers for both cases. (same below)

Also, do you need to test for int128 support? Or is that guaranteed
for all compilers that are supported by the RISC-V port?
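Something like the below is what I mean - just a rough, untested
sketch, keeping your 128-bit wrappers and assuming <linux/unaligned.h>
is the right include in this tree:

#include <linux/unaligned.h>

static __always_inline __uint128_t get_unaligned_be128(const u8 *p)
{
        /* the generic helper handles misaligned pointers on every
         * config, without the UB pointer cast */
        return ((__uint128_t)get_unaligned_be64(p) << 64) |
               get_unaligned_be64(p + 8);
}

static __always_inline void put_unaligned_be128(__uint128_t val, u8 *p)
{
        put_unaligned_be64(val >> 64, p);
        put_unaligned_be64(val, p + 8);
}

The generic helpers also take care of the byte order, so the
byte-by-byte fallback and the explicit rev8 in these two wrappers go
away entirely.
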
> +        val = *(__uint128_t *)p;
> +        val = riscv_zbb_swab64(val >> 64) | (__uint128_t)riscv_zbb_swab64(val) << 64;
> +#else
> +        val = (__uint128_t)p[0] << 120;
> +        val |= (__uint128_t)p[1] << 112;
> +        val |= (__uint128_t)p[2] << 104;
> +        val |= (__uint128_t)p[3] << 96;
> +        val |= (__uint128_t)p[4] << 88;
> +        val |= (__uint128_t)p[5] << 80;
> +        val |= (__uint128_t)p[6] << 72;
> +        val |= (__uint128_t)p[7] << 64;
> +        val |= (__uint128_t)p[8] << 56;
> +        val |= (__uint128_t)p[9] << 48;
> +        val |= (__uint128_t)p[10] << 40;
> +        val |= (__uint128_t)p[11] << 32;
> +        val |= (__uint128_t)p[12] << 24;
> +        val |= (__uint128_t)p[13] << 16;
> +        val |= (__uint128_t)p[14] << 8;
> +        val |= (__uint128_t)p[15];
> +#endif
> +        return val;
> +}
> +
> +static __always_inline void put_unaligned_be128(__uint128_t val, u8 *p)
> +{
> +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> +        *(__uint128_t *)p = riscv_zbb_swab64(val >> 64) | (__uint128_t)riscv_zbb_swab64(val) << 64;
> +#else
> +        p[0] = val >> 120;
> +        p[1] = val >> 112;
> +        p[2] = val >> 104;
> +        p[3] = val >> 96;
> +        p[4] = val >> 88;
> +        p[5] = val >> 80;
> +        p[6] = val >> 72;
> +        p[7] = val >> 64;
> +        p[8] = val >> 56;
> +        p[9] = val >> 48;
> +        p[10] = val >> 40;
> +        p[11] = val >> 32;
> +        p[12] = val >> 24;
> +        p[13] = val >> 16;
> +        p[14] = val >> 8;
> +        p[15] = val;
> +#endif
> +}
> +
> +static __always_inline __attribute_const__
> +__uint128_t clmul128(u64 a, u64 b)
> +{
> +        u64 hi, lo;
> +
> +        asm(".option push\n"
> +            ".option arch,+zbc\n"
> +            "clmul %0, %2, %3\n"
> +            "clmulh %1, %2, %3\n"
> +            ".option pop\n"
> +            : "=&r" (lo), "=&r" (hi) : "r" (a), "r" (b));
> +        return (__uint128_t)hi << 64 | lo;
> +}
> +
> +static int riscv64_clmul_ghash_init(struct shash_desc *desc)
> +{
> +        struct riscv64_clmul_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
> +
> +        dctx->bytes = 0;
> +        dctx->shash = 0;
> +        return 0;
> +}
> +
> +/* Compute GMULT (Xi*H mod f) using the Zbc (clmul) extensions.
> + * Using the no-Karatsuba approach and clmul for the final reduction.
> + * This results in an implementation with minimized number of instructions.
> + * HW with clmul latencies higher than 2 cycles might observe a performance
> + * improvement with Karatsuba. HW with clmul latencies higher than 6 cycles
> + * might observe a performance improvement with additionally converting the
> + * reduction to shift&xor. For a full discussion of these estimates see
> + * https://github.com/riscv/riscv-crypto/blob/master/doc/supp/gcm-mode-cmul.adoc
> + */
> +static void gcm_ghash_rv64i_zbc(__uint128_t *Xi, __uint128_t k, const u8 *inp, size_t len)
> +{
> +        u64 k_hi = k >> 64, k_lo = k, p_hi, p_lo;
> +        __uint128_t hash = *Xi, p;
> +
> +        do {
> +                __uint128_t t0, t1, t2, t3, lo, mid, hi;
> +
> +                /* Load the input data, byte-reverse them, and XOR them with Xi */
> +                p = get_unaligned_be128(inp);
> +
> +                inp += GHASH_BLOCK_SIZE;
> +                len -= GHASH_BLOCK_SIZE;
> +
> +                p ^= hash;
> +                p_hi = p >> 64;
> +                p_lo = p;
> +
> +                /* Multiplication (without Karatsuba) */
> +                t0 = clmul128(p_lo, k_lo);
> +                t1 = clmul128(p_lo, k_hi);
> +                t2 = clmul128(p_hi, k_lo);
> +                t3 = clmul128(p_hi, k_hi);
> +                mid = t1 ^ t2;
> +                lo = t0 ^ (mid << 64);
> +                hi = t3 ^ (mid >> 64);
> +
> +                /* Reduction with clmul */
> +                mid = clmul128(lo, GHASH_MOD_POLY);
> +                lo ^= mid << 64;
> +                hi ^= lo ^ (mid >> 64);
> +                hi ^= clmul128(lo >> 64, GHASH_MOD_POLY);
> +                hash = hi;
> +        } while (len);
> +
> +        *Xi = hash;
> +}
> +
> +static int riscv64_clmul_ghash_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen)
> +{
> +        struct riscv64_clmul_ghash_ctx *ctx = crypto_shash_ctx(tfm);
> +        __uint128_t k;
> +
> +        if (keylen != GHASH_BLOCK_SIZE)
> +                return -EINVAL;
> +
> +        k = get_unaligned_be128(key);
> +        k = (k << 1 | k >> 127) ^ (k >> 127 ? (__uint128_t)GHASH_MOD_POLY << 64 : 0);
> +        ctx->key = k;
> +
> +        return 0;
> +}
> +
> +static int riscv64_clmul_ghash_update(struct shash_desc *desc, const u8 *src, unsigned int srclen)
> +{
> +        struct riscv64_clmul_ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
> +        struct riscv64_clmul_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
> +        unsigned int len;
> +
> +        if (dctx->bytes) {
> +                if (dctx->bytes + srclen < GHASH_DIGEST_SIZE) {
> +                        memcpy(dctx->buffer + dctx->bytes, src, srclen);
> +                        dctx->bytes += srclen;
> +                        return 0;
> +                }
> +                memcpy(dctx->buffer + dctx->bytes, src, GHASH_DIGEST_SIZE - dctx->bytes);
> +
> +                gcm_ghash_rv64i_zbc(&dctx->shash, ctx->key, dctx->buffer, GHASH_DIGEST_SIZE);
> +
> +                src += GHASH_DIGEST_SIZE - dctx->bytes;
> +                srclen -= GHASH_DIGEST_SIZE - dctx->bytes;
> +                dctx->bytes = 0;
> +        }
> +
> +        len = round_down(srclen, GHASH_BLOCK_SIZE);
> +        if (len) {
> +                gcm_ghash_rv64i_zbc(&dctx->shash, ctx->key, src, len);
> +                src += len;
> +                srclen -= len;
> +        }
> +
> +        if (srclen) {
> +                memcpy(dctx->buffer, src, srclen);
> +                dctx->bytes = srclen;
> +        }
> +        return 0;
> +}
> +
> +static int riscv64_clmul_ghash_final(struct shash_desc *desc, u8 out[GHASH_DIGEST_SIZE])
> +{
> +        struct riscv64_clmul_ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
> +        struct riscv64_clmul_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
> +        int i;
> +
> +        if (dctx->bytes) {
> +                for (i = dctx->bytes; i < GHASH_DIGEST_SIZE; i++)
> +                        dctx->buffer[i] = 0;
> +                gcm_ghash_rv64i_zbc(&dctx->shash, ctx->key, dctx->buffer, GHASH_DIGEST_SIZE);
> +                dctx->bytes = 0;
> +        }
> +        put_unaligned_be128(dctx->shash, out);
> +        return 0;
> +}
> +
> +struct shash_alg riscv64_clmul_ghash_alg = {
> +        .init = riscv64_clmul_ghash_init,
> +        .update = riscv64_clmul_ghash_update,
> +        .final = riscv64_clmul_ghash_final,
> +        .setkey = riscv64_clmul_ghash_setkey,
> +        .descsize = sizeof(struct riscv64_clmul_ghash_desc_ctx),
> +        .digestsize = GHASH_DIGEST_SIZE,
> +        .base = {
> +                .cra_blocksize = GHASH_BLOCK_SIZE,
> +                .cra_ctxsize = sizeof(struct riscv64_clmul_ghash_ctx),
> +                .cra_priority = 250,
> +                .cra_name = "ghash",
> +                .cra_driver_name = "ghash-riscv64-clmul",
> +                .cra_module = THIS_MODULE,
> +        },
> +};
> +
> +static int __init riscv64_clmul_ghash_mod_init(void)
> +{
> +        bool has_clmul, has_rev8;
> +
> +        has_clmul = riscv_isa_extension_available(NULL, ZBC) ||
> +                    riscv_isa_extension_available(NULL, ZBKC);
> +        has_rev8 = riscv_isa_extension_available(NULL, ZBB) ||
> +                   riscv_isa_extension_available(NULL, ZBKB);
> +        if (has_clmul && has_rev8)
> +                return crypto_register_shash(&riscv64_clmul_ghash_alg);
> +
> +        return -ENODEV;
> +}
> +
> +static void __exit riscv64_clmul_ghash_mod_fini(void)
> +{
> +        crypto_unregister_shash(&riscv64_clmul_ghash_alg);
> +}
> +
> +module_init(riscv64_clmul_ghash_mod_init);
> +module_exit(riscv64_clmul_ghash_mod_fini);
> +
> +MODULE_DESCRIPTION("GHASH (RISC-V CLMUL accelerated)");
> +MODULE_AUTHOR("Qingfang Deng <dqfext@xxxxxxxxx>");
> +MODULE_LICENSE("GPL");
> +MODULE_ALIAS_CRYPTO("ghash");
> --
> 2.43.0
>
>