| Message ID | 20200813200243.3036772-3-richard.henderson@linaro.org |
|---|---|
| State | Superseded |
| Series | target/arm: Implement an IMPDEF pauth algorithm |
On Thu, Aug 13, 2020 at 01:02:42PM -0700, Richard Henderson wrote:
> Without hardware acceleration, a cryptographically strong
> algorithm is too expensive for pauth_computepac.
>
> Even with hardware accel, we are not currently expecting
> to link the linux-user binaries to any crypto libraries,
> and doing so would generally make the --static build fail.
>
> So choose XXH64 as a reasonably quick and decent hash.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
> v2: Move the XXH64 bits to xxhash.h (ajb).
>     Create isar_feature_aa64_pauth_arch and fixup a comment
>     in isar_feature_aa64_pauth that no longer applies.
> ---
>  include/qemu/xxhash.h     | 82 +++++++++++++++++++++++++++++++++++++++
>  target/arm/cpu.h          | 15 +++++--
>  target/arm/pauth_helper.c | 41 +++++++++++++++++---
>  3 files changed, 129 insertions(+), 9 deletions(-)
>
[...]
>
> +/*
> + * The XXH64 algorithm from
> + * https://github.com/Cyan4973/xxHash/blob/v0.8.0/xxhash.h
> + */
> +static uint64_t __attribute__((noinline))
> +pauth_computepac_impdef(uint64_t data, uint64_t modifier, ARMPACKey key)

Out of curiosity, why do we need to make these computepac functions
noinline?

> +{
> +    uint64_t v1 = QEMU_XXHASH_SEED + PRIME64_1 + PRIME64_2;
> +    uint64_t v2 = QEMU_XXHASH_SEED + PRIME64_2;
> +    uint64_t v3 = QEMU_XXHASH_SEED + 0;
> +    uint64_t v4 = QEMU_XXHASH_SEED - PRIME64_1;
> +
> +    v1 = XXH64_round(v1, data);
> +    v2 = XXH64_round(v2, modifier);
> +    v3 = XXH64_round(v3, key.lo);
> +    v4 = XXH64_round(v4, key.hi);
> +
> +    return XXH64_avalanche(XXH64_mergerounds(v1, v2, v3, v4));
> +}
> +
> +static uint64_t pauth_computepac(CPUARMState *env, uint64_t data,
> +                                 uint64_t modifier, ARMPACKey key)
> +{
> +    if (cpu_isar_feature(aa64_pauth_arch, env_archcpu(env))) {
> +        return pauth_computepac_architected(data, modifier, key);
> +    } else {
> +        return pauth_computepac_impdef(data, modifier, key);
> +    }
> +}

I think this patch should come before the last one. As it stands, when
bisecting between the last one and this one a user could attempt to
enable pauth-imdef, but it wouldn't do anything, or it would potentially
break things. However, this patch shouldn't change anything if it comes
first.

Thanks,
drew
On 8/14/20 2:26 AM, Andrew Jones wrote:
>> +static uint64_t __attribute__((noinline))
>> +pauth_computepac_impdef(uint64_t data, uint64_t modifier, ARMPACKey key)
>
> Out of curiosity, why do we need to make these computepac functions
> noinline?

Oh, heh.  Left over from profiling.  Will remove.

> I think this patch should come before the last one. As it stands, when
> bisecting between the last one and this one a user could attempt to
> enable pauth-imdef, but it wouldn't do anything, or it would potentially
> break things. However, this patch shouldn't change anything if it comes
> first.

The current patch ordering would enable impdef but implement that with
the architected algorithm.  Which is ok.  But you're right that the
other ordering makes more sense.

r~
diff --git a/include/qemu/xxhash.h b/include/qemu/xxhash.h
index 076f1f6054..93ba1a0425 100644
--- a/include/qemu/xxhash.h
+++ b/include/qemu/xxhash.h
@@ -119,4 +119,86 @@ static inline uint32_t qemu_xxhash6(uint64_t ab, uint64_t cd, uint32_t e,
     return qemu_xxhash7(ab, cd, e, f, 0);
 }
 
+/*
+ * Component parts of the XXH64 algorithm from
+ * https://github.com/Cyan4973/xxHash/blob/v0.8.0/xxhash.h
+ *
+ * The complete algorithm looks like
+ *
+ *   i = 0;
+ *   if (len >= 32) {
+ *       v1 = seed + PRIME64_1 + PRIME64_2;
+ *       v2 = seed + PRIME64_2;
+ *       v3 = seed + 0;
+ *       v4 = seed - XXH_PRIME64_1;
+ *       do {
+ *           v1 = XXH64_round(v1, get64bits(input + i));
+ *           v2 = XXH64_round(v2, get64bits(input + i + 8));
+ *           v3 = XXH64_round(v3, get64bits(input + i + 16));
+ *           v4 = XXH64_round(v4, get64bits(input + i + 24));
+ *       } while ((i += 32) <= len);
+ *       h64 = XXH64_mergerounds(v1, v2, v3, v4);
+ *   } else {
+ *       h64 = seed + PRIME64_5;
+ *   }
+ *   h64 += len;
+ *
+ *   for (; i + 8 <= len; i += 8) {
+ *       h64 ^= XXH64_round(0, get64bits(input + i));
+ *       h64 = rol64(h64, 27) * PRIME64_1 + PRIME64_4;
+ *   }
+ *   for (; i + 4 <= len; i += 4) {
+ *       h64 ^= get32bits(input + i) * PRIME64_1;
+ *       h64 = rol64(h64, 23) * PRIME64_2 + PRIME64_3;
+ *   }
+ *   for (; i < len; i += 1) {
+ *       h64 ^= get8bits(input + i) * PRIME64_5;
+ *       h64 = rol64(h64, 11) * PRIME64_1;
+ *   }
+ *
+ *   return XXH64_avalanche(h64)
+ *
+ * Exposing the pieces instead allows for simplified usage when
+ * the length is a known constant and the inputs are in registers.
+ */
+#define PRIME64_1 0x9E3779B185EBCA87ULL
+#define PRIME64_2 0xC2B2AE3D27D4EB4FULL
+#define PRIME64_3 0x165667B19E3779F9ULL
+#define PRIME64_4 0x85EBCA77C2B2AE63ULL
+#define PRIME64_5 0x27D4EB2F165667C5ULL
+
+static inline uint64_t XXH64_round(uint64_t acc, uint64_t input)
+{
+    return rol64(acc + input * PRIME64_2, 31) * PRIME64_1;
+}
+
+static inline uint64_t XXH64_mergeround(uint64_t acc, uint64_t val)
+{
+    return (acc ^ XXH64_round(0, val)) * PRIME64_1 + PRIME64_4;
+}
+
+static inline uint64_t XXH64_mergerounds(uint64_t v1, uint64_t v2,
+                                         uint64_t v3, uint64_t v4)
+{
+    uint64_t h64;
+
+    h64 = rol64(v1, 1) + rol64(v2, 7) + rol64(v3, 12) + rol64(v4, 18);
+    h64 = XXH64_mergeround(h64, v1);
+    h64 = XXH64_mergeround(h64, v2);
+    h64 = XXH64_mergeround(h64, v3);
+    h64 = XXH64_mergeround(h64, v4);
+
+    return h64;
+}
+
+static inline uint64_t XXH64_avalanche(uint64_t h64)
+{
+    h64 ^= h64 >> 33;
+    h64 *= PRIME64_2;
+    h64 ^= h64 >> 29;
+    h64 *= PRIME64_3;
+    h64 ^= h64 >> 32;
+    return h64;
+}
+
 #endif /* QEMU_XXHASH_H */
diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index 44901923c8..776bf30cbc 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -3767,10 +3767,8 @@ static inline bool isar_feature_aa64_fcma(const ARMISARegisters *id)
 static inline bool isar_feature_aa64_pauth(const ARMISARegisters *id)
 {
     /*
-     * Note that while QEMU will only implement the architected algorithm
-     * QARMA, and thus APA+GPA, the host cpu for kvm may use implementation
-     * defined algorithms, and thus API+GPI, and this predicate controls
-     * migration of the 128-bit keys.
+     * Return true if any form of pauth is enabled, as this
+     * predicate controls migration of the 128-bit keys.
      */
     return (id->id_aa64isar1 &
             (FIELD_DP64(0, ID_AA64ISAR1, APA, 0xf) |
@@ -3779,6 +3777,15 @@ static inline bool isar_feature_aa64_pauth(const ARMISARegisters *id)
              FIELD_DP64(0, ID_AA64ISAR1, GPI, 0xf))) != 0;
 }
 
+static inline bool isar_feature_aa64_pauth_arch(const ARMISARegisters *id)
+{
+    /*
+     * Return true if pauth is enabled with the architected QARMA algorithm.
+     * QEMU will always set APA+GPA to the same value.
+     */
+    return FIELD_EX64(id->id_aa64isar1, ID_AA64ISAR1, APA) != 0;
+}
+
 static inline bool isar_feature_aa64_sb(const ARMISARegisters *id)
 {
     return FIELD_EX64(id->id_aa64isar1, ID_AA64ISAR1, SB) != 0;
diff --git a/target/arm/pauth_helper.c b/target/arm/pauth_helper.c
index 6dbab03768..6ec4f83ff0 100644
--- a/target/arm/pauth_helper.c
+++ b/target/arm/pauth_helper.c
@@ -24,6 +24,7 @@
 #include "exec/cpu_ldst.h"
 #include "exec/helper-proto.h"
 #include "tcg/tcg-gvec-desc.h"
+#include "qemu/xxhash.h"
 
 
 static uint64_t pac_cell_shuffle(uint64_t i)
@@ -207,8 +208,8 @@ static uint64_t tweak_inv_shuffle(uint64_t i)
     return o;
 }
 
-static uint64_t pauth_computepac(uint64_t data, uint64_t modifier,
-                                 ARMPACKey key)
+static uint64_t __attribute__((noinline))
+pauth_computepac_architected(uint64_t data, uint64_t modifier, ARMPACKey key)
 {
     static const uint64_t RC[5] = {
         0x0000000000000000ull,
@@ -272,6 +273,36 @@ static uint64_t pauth_computepac(uint64_t data, uint64_t modifier,
     return workingval;
 }
 
+/*
+ * The XXH64 algorithm from
+ * https://github.com/Cyan4973/xxHash/blob/v0.8.0/xxhash.h
+ */
+static uint64_t __attribute__((noinline))
+pauth_computepac_impdef(uint64_t data, uint64_t modifier, ARMPACKey key)
+{
+    uint64_t v1 = QEMU_XXHASH_SEED + PRIME64_1 + PRIME64_2;
+    uint64_t v2 = QEMU_XXHASH_SEED + PRIME64_2;
+    uint64_t v3 = QEMU_XXHASH_SEED + 0;
+    uint64_t v4 = QEMU_XXHASH_SEED - PRIME64_1;
+
+    v1 = XXH64_round(v1, data);
+    v2 = XXH64_round(v2, modifier);
+    v3 = XXH64_round(v3, key.lo);
+    v4 = XXH64_round(v4, key.hi);
+
+    return XXH64_avalanche(XXH64_mergerounds(v1, v2, v3, v4));
+}
+
+static uint64_t pauth_computepac(CPUARMState *env, uint64_t data,
+                                 uint64_t modifier, ARMPACKey key)
+{
+    if (cpu_isar_feature(aa64_pauth_arch, env_archcpu(env))) {
+        return pauth_computepac_architected(data, modifier, key);
+    } else {
+        return pauth_computepac_impdef(data, modifier, key);
+    }
+}
+
 static uint64_t pauth_addpac(CPUARMState *env, uint64_t ptr, uint64_t modifier,
                              ARMPACKey *key, bool data)
 {
@@ -292,7 +323,7 @@ static uint64_t pauth_addpac(CPUARMState *env, uint64_t ptr, uint64_t modifier,
     bot_bit = 64 - param.tsz;
     ext_ptr = deposit64(ptr, bot_bit, top_bit - bot_bit, ext);
 
-    pac = pauth_computepac(ext_ptr, modifier, *key);
+    pac = pauth_computepac(env, ext_ptr, modifier, *key);
 
     /*
      * Check if the ptr has good extension bits and corrupt the
@@ -341,7 +372,7 @@ static uint64_t pauth_auth(CPUARMState *env, uint64_t ptr, uint64_t modifier,
     uint64_t pac, orig_ptr, test;
 
     orig_ptr = pauth_original_ptr(ptr, param);
-    pac = pauth_computepac(orig_ptr, modifier, *key);
+    pac = pauth_computepac(env, orig_ptr, modifier, *key);
     bot_bit = 64 - param.tsz;
     top_bit = 64 - 8 * param.tbi;
 
@@ -442,7 +473,7 @@ uint64_t HELPER(pacga)(CPUARMState *env, uint64_t x, uint64_t y)
     uint64_t pac;
 
     pauth_check_trap(env, arm_current_el(env), GETPC());
-    pac = pauth_computepac(x, y, env->keys.apga);
+    pac = pauth_computepac(env, x, y, env->keys.apga);
 
     return pac & 0xffffffff00000000ull;
 }
Without hardware acceleration, a cryptographically strong
algorithm is too expensive for pauth_computepac.

Even with hardware accel, we are not currently expecting
to link the linux-user binaries to any crypto libraries,
and doing so would generally make the --static build fail.

So choose XXH64 as a reasonably quick and decent hash.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
v2: Move the XXH64 bits to xxhash.h (ajb).
    Create isar_feature_aa64_pauth_arch and fixup a comment
    in isar_feature_aa64_pauth that no longer applies.
---
 include/qemu/xxhash.h     | 82 +++++++++++++++++++++++++++++++++++++++
 target/arm/cpu.h          | 15 +++++--
 target/arm/pauth_helper.c | 41 +++++++++++++++++---
 3 files changed, 129 insertions(+), 9 deletions(-)

-- 
2.25.1
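For reference, the "known constant length, inputs in registers" usage that the
new xxhash.h comment describes can also be exercised as a standalone program.
The sketch below is not part of the patch: rol64, which QEMU takes from
qemu/bitops.h, is open-coded, the seed and input words are arbitrary stand-ins
for QEMU_XXHASH_SEED, data, modifier and the two key halves, and, like
pauth_computepac_impdef, it skips the length addition and tail loops of the
full XXH64 algorithm.

/* xxh64_sketch.c -- illustrative only; names below are local, not QEMU's. */
#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>

#define PRIME64_1 0x9E3779B185EBCA87ULL
#define PRIME64_2 0xC2B2AE3D27D4EB4FULL
#define PRIME64_3 0x165667B19E3779F9ULL
#define PRIME64_4 0x85EBCA77C2B2AE63ULL
#define PRIME64_5 0x27D4EB2F165667C5ULL

/* Stand-in for QEMU's rol64() from qemu/bitops.h. */
static inline uint64_t rol64(uint64_t x, unsigned r)
{
    return (x << r) | (x >> (64 - r));
}

static inline uint64_t xxh64_round(uint64_t acc, uint64_t input)
{
    return rol64(acc + input * PRIME64_2, 31) * PRIME64_1;
}

static inline uint64_t xxh64_mergeround(uint64_t acc, uint64_t val)
{
    return (acc ^ xxh64_round(0, val)) * PRIME64_1 + PRIME64_4;
}

static inline uint64_t xxh64_avalanche(uint64_t h64)
{
    h64 ^= h64 >> 33;
    h64 *= PRIME64_2;
    h64 ^= h64 >> 29;
    h64 *= PRIME64_3;
    h64 ^= h64 >> 32;
    return h64;
}

/*
 * Hash exactly four 64-bit words using the same composition as
 * pauth_computepac_impdef: one round per word, merge, avalanche.  The
 * "h64 += len" step and the 8/4/1-byte tail loops of full XXH64 are
 * deliberately omitted, matching the patch.
 */
static uint64_t hash4_u64(uint64_t seed, uint64_t a, uint64_t b,
                          uint64_t c, uint64_t d)
{
    uint64_t v1 = seed + PRIME64_1 + PRIME64_2;
    uint64_t v2 = seed + PRIME64_2;
    uint64_t v3 = seed + 0;
    uint64_t v4 = seed - PRIME64_1;
    uint64_t h64;

    v1 = xxh64_round(v1, a);
    v2 = xxh64_round(v2, b);
    v3 = xxh64_round(v3, c);
    v4 = xxh64_round(v4, d);

    h64 = rol64(v1, 1) + rol64(v2, 7) + rol64(v3, 12) + rol64(v4, 18);
    h64 = xxh64_mergeround(h64, v1);
    h64 = xxh64_mergeround(h64, v2);
    h64 = xxh64_mergeround(h64, v3);
    h64 = xxh64_mergeround(h64, v4);

    return xxh64_avalanche(h64);
}

int main(void)
{
    /* Arbitrary stand-ins for data, modifier, key.lo, key.hi; seed = 1. */
    uint64_t h = hash4_u64(1, 0x0123456789abcdefULL, 0x42ULL,
                           0xfeedfacecafef00dULL, 0xdeadbeef12345678ULL);
    printf("0x%016" PRIx64 "\n", h);
    return 0;
}

Compiling with something like "cc -O2 xxh64_sketch.c" and running it prints a
single 64-bit hash; changing any one input word should flip roughly half the
output bits, which is the point of the final avalanche step.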