From: Thomas Pornin Date: Thu, 5 Jan 2017 18:47:00 +0000 (+0100) Subject: Two new Poly1305 implementations: ctmul32 uses pure 32-bit multiplications (MUL15... X-Git-Tag: v0.4~26 X-Git-Url: https://www.bearssl.org/gitweb//home/git/?p=BearSSL;a=commitdiff_plain;h=7fc1ef315f807170f63b0ad8255cf77314b50ca5 Two new Poly1305 implementations: ctmul32 uses pure 32-bit multiplications (MUL15, constant-time on about everything); i15 uses the generic i15 big integers (MUL15 again), which is quite slow but also small. --- diff --git a/Makefile b/Makefile index 32c5d7b..4c627f5 100644 --- a/Makefile +++ b/Makefile @@ -57,7 +57,7 @@ OBJMAC = $(BUILD)/hmac.o $(BUILD)/hmac_ct.o OBJRAND = $(BUILD)/hmac_drbg.o OBJRSA = $(BUILD)/rsa_i15_pkcs1_sign.o $(BUILD)/rsa_i15_pkcs1_vrfy.o $(BUILD)/rsa_i15_priv.o $(BUILD)/rsa_i15_pub.o $(BUILD)/rsa_i31_pkcs1_sign.o $(BUILD)/rsa_i31_pkcs1_vrfy.o $(BUILD)/rsa_i31_priv.o $(BUILD)/rsa_i31_pub.o $(BUILD)/rsa_i32_pkcs1_sign.o $(BUILD)/rsa_i32_pkcs1_vrfy.o $(BUILD)/rsa_i32_priv.o $(BUILD)/rsa_i32_pub.o $(BUILD)/rsa_pkcs1_sig_pad.o $(BUILD)/rsa_pkcs1_sig_unpad.o $(BUILD)/rsa_ssl_decrypt.o OBJSSL = $(BUILD)/prf.o $(BUILD)/prf_md5sha1.o $(BUILD)/prf_sha256.o $(BUILD)/prf_sha384.o $(BUILD)/ssl_ccert_single_ec.o $(BUILD)/ssl_ccert_single_rsa.o $(BUILD)/ssl_client.o $(BUILD)/ssl_client_full.o $(BUILD)/ssl_engine.o $(BUILD)/ssl_hashes.o $(BUILD)/ssl_hs_client.o $(BUILD)/ssl_hs_server.o $(BUILD)/ssl_io.o $(BUILD)/ssl_lru.o $(BUILD)/ssl_rec_cbc.o $(BUILD)/ssl_rec_chapol.o $(BUILD)/ssl_rec_gcm.o $(BUILD)/ssl_server.o $(BUILD)/ssl_server_mine2c.o $(BUILD)/ssl_server_mine2g.o $(BUILD)/ssl_server_minf2c.o $(BUILD)/ssl_server_minf2g.o $(BUILD)/ssl_server_minr2g.o $(BUILD)/ssl_server_minu2g.o $(BUILD)/ssl_server_minv2g.o $(BUILD)/ssl_server_full_ec.o $(BUILD)/ssl_server_full_rsa.o $(BUILD)/ssl_scert_single_ec.o $(BUILD)/ssl_scert_single_rsa.o -OBJSYMCIPHER = $(BUILD)/aes_big_cbcdec.o $(BUILD)/aes_big_cbcenc.o $(BUILD)/aes_big_ctr.o $(BUILD)/aes_big_dec.o $(BUILD)/aes_big_enc.o $(BUILD)/aes_common.o $(BUILD)/aes_ct.o $(BUILD)/aes_ct64.o $(BUILD)/aes_ct64_cbcdec.o $(BUILD)/aes_ct64_cbcenc.o $(BUILD)/aes_ct64_ctr.o $(BUILD)/aes_ct64_dec.o $(BUILD)/aes_ct64_enc.o $(BUILD)/aes_ct_cbcdec.o $(BUILD)/aes_ct_cbcenc.o $(BUILD)/aes_ct_ctr.o $(BUILD)/aes_ct_dec.o $(BUILD)/aes_ct_enc.o $(BUILD)/aes_small_cbcdec.o $(BUILD)/aes_small_cbcenc.o $(BUILD)/aes_small_ctr.o $(BUILD)/aes_small_dec.o $(BUILD)/aes_small_enc.o $(BUILD)/chacha20_ct.o $(BUILD)/des_ct.o $(BUILD)/des_ct_cbcdec.o $(BUILD)/des_ct_cbcenc.o $(BUILD)/des_support.o $(BUILD)/des_tab.o $(BUILD)/des_tab_cbcdec.o $(BUILD)/des_tab_cbcenc.o $(BUILD)/poly1305_ctmul.o +OBJSYMCIPHER = $(BUILD)/aes_big_cbcdec.o $(BUILD)/aes_big_cbcenc.o $(BUILD)/aes_big_ctr.o $(BUILD)/aes_big_dec.o $(BUILD)/aes_big_enc.o $(BUILD)/aes_common.o $(BUILD)/aes_ct.o $(BUILD)/aes_ct64.o $(BUILD)/aes_ct64_cbcdec.o $(BUILD)/aes_ct64_cbcenc.o $(BUILD)/aes_ct64_ctr.o $(BUILD)/aes_ct64_dec.o $(BUILD)/aes_ct64_enc.o $(BUILD)/aes_ct_cbcdec.o $(BUILD)/aes_ct_cbcenc.o $(BUILD)/aes_ct_ctr.o $(BUILD)/aes_ct_dec.o $(BUILD)/aes_ct_enc.o $(BUILD)/aes_small_cbcdec.o $(BUILD)/aes_small_cbcenc.o $(BUILD)/aes_small_ctr.o $(BUILD)/aes_small_dec.o $(BUILD)/aes_small_enc.o $(BUILD)/chacha20_ct.o $(BUILD)/des_ct.o $(BUILD)/des_ct_cbcdec.o $(BUILD)/des_ct_cbcenc.o $(BUILD)/des_support.o $(BUILD)/des_tab.o $(BUILD)/des_tab_cbcdec.o $(BUILD)/des_tab_cbcenc.o $(BUILD)/poly1305_ctmul.o $(BUILD)/poly1305_ctmul32.o $(BUILD)/poly1305_i15.o OBJX509 = $(BUILD)/skey_decoder.o $(BUILD)/x509_decoder.o $(BUILD)/x509_knownkey.o $(BUILD)/x509_minimal.o $(BUILD)/x509_minimal_full.o OBJ = $(OBJCODEC) $(OBJEC) $(OBJHASH) $(OBJINT15) $(OBJINT31) $(OBJINT32) $(OBJMAC) $(OBJRAND) $(OBJRSA) $(OBJSSL) $(OBJSYMCIPHER) $(OBJX509) OBJBRSSL = $(BUILD)/brssl.o $(BUILD)/certs.o $(BUILD)/chain.o $(BUILD)/client.o $(BUILD)/errors.o $(BUILD)/files.o $(BUILD)/keys.o $(BUILD)/names.o $(BUILD)/server.o $(BUILD)/skey.o $(BUILD)/sslio.o $(BUILD)/ta.o $(BUILD)/vector.o $(BUILD)/verify.o $(BUILD)/xmem.o @@ -606,6 +606,12 @@ $(BUILD)/des_tab_cbcenc.o: src/symcipher/des_tab_cbcenc.c $(HEADERS) $(BUILD)/poly1305_ctmul.o: src/symcipher/poly1305_ctmul.c $(HEADERS) $(CC) $(CFLAGS) -c -o $(BUILD)/poly1305_ctmul.o src/symcipher/poly1305_ctmul.c +$(BUILD)/poly1305_ctmul32.o: src/symcipher/poly1305_ctmul32.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $(BUILD)/poly1305_ctmul32.o src/symcipher/poly1305_ctmul32.c + +$(BUILD)/poly1305_i15.o: src/symcipher/poly1305_i15.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $(BUILD)/poly1305_i15.o src/symcipher/poly1305_i15.c + $(BUILD)/skey_decoder.o: src/x509/skey_decoder.c $(HEADERS) $(CC) $(CFLAGS) -c -o $(BUILD)/skey_decoder.o src/x509/skey_decoder.c diff --git a/inc/bearssl_block.h b/inc/bearssl_block.h index 547f3b2..5281dcf 100644 --- a/inc/bearssl_block.h +++ b/inc/bearssl_block.h @@ -1323,4 +1323,48 @@ void br_poly1305_ctmul_run(const void *key, const void *iv, void *data, size_t len, const void *aad, size_t aad_len, void *tag, br_chacha20_run ichacha, int encrypt); +/** + * \brief ChaCha20+Poly1305 AEAD implementation (pure 32-bit multiplications). + * + * \see br_poly1305_run + * + * \param key secret key (32 bytes). + * \param iv nonce (12 bytes). + * \param data data to encrypt or decrypt. + * \param len data length (in bytes). + * \param aad additional authenticated data. + * \param aad_len length of additional authenticated data (in bytes). + * \param tag output buffer for the authentication tag. + * \param ichacha implementation of ChaCha20. + * \param encrypt non-zero for encryption, zero for decryption. + */ +void br_poly1305_ctmul32_run(const void *key, const void *iv, + void *data, size_t len, const void *aad, size_t aad_len, + void *tag, br_chacha20_run ichacha, int encrypt); + +/** + * \brief ChaCha20+Poly1305 AEAD implementation (i15). + * + * This implementation relies on the generic big integer code "i15" + * (which uses pure 32-bit multiplications). As such, it may save a + * little code footprint in a context where "i15" is already included + * (e.g. for elliptic curves or for RSA); however, it is also + * substantially slower than the ctmul and ctmul32 implementations. + * + * \see br_poly1305_run + * + * \param key secret key (32 bytes). + * \param iv nonce (12 bytes). + * \param data data to encrypt or decrypt. + * \param len data length (in bytes). + * \param aad additional authenticated data. + * \param aad_len length of additional authenticated data (in bytes). + * \param tag output buffer for the authentication tag. + * \param ichacha implementation of ChaCha20. + * \param encrypt non-zero for encryption, zero for decryption. + */ +void br_poly1305_i15_run(const void *key, const void *iv, + void *data, size_t len, const void *aad, size_t aad_len, + void *tag, br_chacha20_run ichacha, int encrypt); + #endif diff --git a/src/inner.h b/src/inner.h index 8417b24..fd764bc 100644 --- a/src/inner.h +++ b/src/inner.h @@ -571,16 +571,17 @@ MUL31_lo(uint32_t x, uint32_t y) #endif /* - * Multiply two words together; each word may contain up to 15 bits of - * data. If BR_CT_MUL15 is non-zero, then the macro will contain some - * extra operations that help in making the operation constant-time on - * some platforms, where the basic 32-bit multiplication is not - * constant-time. + * Multiply two words together; the sum of the lengths of the two + * operands must not exceed 31 (for instance, one operand may use 16 + * bits if the other fits on 15). If BR_CT_MUL15 is non-zero, then the + * macro will contain some extra operations that help in making the + * operation constant-time on some platforms, where the basic 32-bit + * multiplication is not constant-time. */ #if BR_CT_MUL15 #define MUL15(x, y) (((uint32_t)(x) | (uint32_t)0x80000000) \ * ((uint32_t)(y) | (uint32_t)0x80000000) \ - & (uint32_t)0x3FFFFFFF) + & (uint32_t)0x7FFFFFFF) #else #define MUL15(x, y) ((uint32_t)(x) * (uint32_t)(y)) #endif diff --git a/src/symcipher/poly1305_ctmul32.c b/src/symcipher/poly1305_ctmul32.c new file mode 100644 index 0000000..15d9635 --- /dev/null +++ b/src/symcipher/poly1305_ctmul32.c @@ -0,0 +1,297 @@ +/* + * Copyright (c) 2017 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* + * Perform the inner processing of blocks for Poly1305. + */ +static void +poly1305_inner(uint32_t *a, const uint32_t *r, const void *data, size_t len) +{ + /* + * Implementation notes: we split the 130-bit values into ten + * 13-bit words. This gives us some space for carries and allows + * using only 32x32->32 multiplications, which are way faster than + * 32x32->64 multiplications on the ARM Cortex-M0/M0+, and also + * help in making constant-time code on the Cortex-M3. + * + * Since we compute modulo 2^130-5, the "upper words" become + * low words with a factor of 5; that is, x*2^130 = x*5 mod p. + * This has already been integrated in the r[] array, which + * is extended to the 0..18 range. + * + * In each loop iteration, a[] and r[] words are 13-bit each, + * except a[1] which may use 14 bits. + */ + const unsigned char *buf; + + buf = data; + while (len > 0) { + unsigned char tmp[16]; + uint32_t b[10]; + unsigned u, v; + uint32_t z, cc1, cc2; + + /* + * If there is a partial block, right-pad it with zeros. + */ + if (len < 16) { + memset(tmp, 0, sizeof tmp); + memcpy(tmp, buf, len); + buf = tmp; + len = 16; + } + + /* + * Decode next block and apply the "high bit"; that value + * is added to the accumulator. + */ + v = br_dec16le(buf); + a[0] += v & 0x01FFF; + v >>= 13; + v |= buf[2] << 3; + v |= buf[3] << 11; + a[1] += v & 0x01FFF; + v >>= 13; + v |= buf[4] << 6; + a[2] += v & 0x01FFF; + v >>= 13; + v |= buf[5] << 1; + v |= buf[6] << 9; + a[3] += v & 0x01FFF; + v >>= 13; + v |= buf[7] << 4; + v |= buf[8] << 12; + a[4] += v & 0x01FFF; + v >>= 13; + v |= buf[9] << 7; + a[5] += v & 0x01FFF; + v >>= 13; + v |= buf[10] << 2; + v |= buf[11] << 10; + a[6] += v & 0x01FFF; + v >>= 13; + v |= buf[12] << 5; + a[7] += v & 0x01FFF; + v = br_dec16le(buf + 13); + a[8] += v & 0x01FFF; + v >>= 13; + v |= buf[15] << 3; + a[9] += v | 0x00800; + + /* + * At that point, all a[] values fit on 14 bits, while + * all r[] values fit on 13 bits. Thus products fit on + * 27 bits, and we can accumulate up to 31 of them in + * a 32-bit word and still have some room for carries. + */ + + /* + * Now a[] contains words with values up to 14 bits each. + * We perform the multiplication with r[]. + * + * The extended words of r[] may be larger than 13 bits + * (they are 5 times a 13-bit word) so the full summation + * may yield values up to 46 times a 27-bit word, which + * does not fit on a 32-bit word. To avoid that issue, we + * must split the loop below in two, with a carry + * propagation operation in the middle. + */ + cc1 = 0; + for (u = 0; u < 10; u ++) { + uint32_t s; + + s = cc1 + + MUL15(a[0], r[u + 9 - 0]) + + MUL15(a[1], r[u + 9 - 1]) + + MUL15(a[2], r[u + 9 - 2]) + + MUL15(a[3], r[u + 9 - 3]) + + MUL15(a[4], r[u + 9 - 4]); + b[u] = s & 0x1FFF; + cc1 = s >> 13; + } + cc2 = 0; + for (u = 0; u < 10; u ++) { + uint32_t s; + + s = b[u] + cc2 + + MUL15(a[5], r[u + 9 - 5]) + + MUL15(a[6], r[u + 9 - 6]) + + MUL15(a[7], r[u + 9 - 7]) + + MUL15(a[8], r[u + 9 - 8]) + + MUL15(a[9], r[u + 9 - 9]); + b[u] = s & 0x1FFF; + cc2 = s >> 13; + } + memcpy(a, b, sizeof b); + + /* + * The two carries "loop back" with a factor of 5. We + * propagate them into a[0] and a[1]. + */ + z = cc1 + cc2; + z += (z << 2) + a[0]; + a[0] = z & 0x1FFF; + a[1] += z >> 13; + + buf += 16; + len -= 16; + } +} + +/* see bearssl_block.h */ +void +br_poly1305_ctmul32_run(const void *key, const void *iv, + void *data, size_t len, const void *aad, size_t aad_len, + void *tag, br_chacha20_run ichacha, int encrypt) +{ + unsigned char pkey[32], foot[16]; + uint32_t z, r[19], acc[10], cc, ctl; + int i; + + /* + * Compute the MAC key. The 'r' value is the first 16 bytes of + * pkey[]. + */ + memset(pkey, 0, sizeof pkey); + ichacha(key, iv, 0, pkey, sizeof pkey); + + /* + * If encrypting, ChaCha20 must run first, followed by Poly1305. + * When decrypting, the operations are reversed. + */ + if (encrypt) { + ichacha(key, iv, 1, data, len); + } + + /* + * Run Poly1305. We must process the AAD, then ciphertext, then + * the footer (with the lengths). Note that the AAD and ciphertext + * are meant to be padded with zeros up to the next multiple of 16, + * and the length of the footer is 16 bytes as well. + */ + + /* + * Decode the 'r' value into 13-bit words, with the "clamping" + * operation applied. + */ + z = br_dec32le(pkey) & 0x03FFFFFF; + r[9] = z & 0x1FFF; + r[10] = z >> 13; + z = (br_dec32le(pkey + 3) >> 2) & 0x03FFFF03; + r[11] = z & 0x1FFF; + r[12] = z >> 13; + z = (br_dec32le(pkey + 6) >> 4) & 0x03FFC0FF; + r[13] = z & 0x1FFF; + r[14] = z >> 13; + z = (br_dec32le(pkey + 9) >> 6) & 0x03F03FFF; + r[15] = z & 0x1FFF; + r[16] = z >> 13; + z = (br_dec32le(pkey + 12) >> 8) & 0x000FFFFF; + r[17] = z & 0x1FFF; + r[18] = z >> 13; + + /* + * Extend r[] with the 5x factor pre-applied. + */ + for (i = 0; i < 9; i ++) { + r[i] = MUL15(5, r[i + 10]); + } + + /* + * Accumulator is 0. + */ + memset(acc, 0, sizeof acc); + + /* + * Process the additional authenticated data, ciphertext, and + * footer in due order. + */ + br_enc64le(foot, (uint64_t)aad_len); + br_enc64le(foot + 8, (uint64_t)len); + poly1305_inner(acc, r, aad, aad_len); + poly1305_inner(acc, r, data, len); + poly1305_inner(acc, r, foot, sizeof foot); + + /* + * Finalise modular reduction. This is done with carry propagation + * and applying the '2^130 = -5 mod p' rule. Note that the output + * of poly1035_inner() is already mostly reduced, since only + * acc[1] may be (very slightly) above 2^13. A single loop back + * to acc[1] will be enough to make the value fit in 130 bits. + */ + cc = 0; + for (i = 1; i < 10; i ++) { + z = acc[i] + cc; + acc[i] = z & 0x1FFF; + cc = z >> 13; + } + z = acc[0] + cc + (cc << 2); + acc[0] = z & 0x1FFF; + acc[1] += z >> 13; + + /* + * We may still have a value in the 2^130-5..2^130-1 range, in + * which case we must reduce it again. The code below selects, + * in constant-time, between 'acc' and 'acc-p', + */ + ctl = GT(acc[0], 0x1FFA); + for (i = 1; i < 10; i ++) { + ctl &= EQ(acc[i], 0x1FFF); + } + acc[0] = MUX(ctl, acc[0] - 0x1FFB, acc[0]); + for (i = 1; i < 10; i ++) { + acc[i] &= ~(-ctl); + } + + /* + * Convert back the accumulator to 32-bit words, and add the + * 's' value (second half of pkey[]). That addition is done + * modulo 2^128. + */ + z = acc[0] + (acc[1] << 13) + br_dec16le(pkey + 16); + br_enc16le((unsigned char *)tag, z & 0xFFFF); + z = (z >> 16) + (acc[2] << 10) + br_dec16le(pkey + 18); + br_enc16le((unsigned char *)tag + 2, z & 0xFFFF); + z = (z >> 16) + (acc[3] << 7) + br_dec16le(pkey + 20); + br_enc16le((unsigned char *)tag + 4, z & 0xFFFF); + z = (z >> 16) + (acc[4] << 4) + br_dec16le(pkey + 22); + br_enc16le((unsigned char *)tag + 6, z & 0xFFFF); + z = (z >> 16) + (acc[5] << 1) + (acc[6] << 14) + br_dec16le(pkey + 24); + br_enc16le((unsigned char *)tag + 8, z & 0xFFFF); + z = (z >> 16) + (acc[7] << 11) + br_dec16le(pkey + 26); + br_enc16le((unsigned char *)tag + 10, z & 0xFFFF); + z = (z >> 16) + (acc[8] << 8) + br_dec16le(pkey + 28); + br_enc16le((unsigned char *)tag + 12, z & 0xFFFF); + z = (z >> 16) + (acc[9] << 5) + br_dec16le(pkey + 30); + br_enc16le((unsigned char *)tag + 14, z & 0xFFFF); + + /* + * If decrypting, then ChaCha20 runs _after_ Poly1305. + */ + if (!encrypt) { + ichacha(key, iv, 1, data, len); + } +} diff --git a/src/symcipher/poly1305_i15.c b/src/symcipher/poly1305_i15.c new file mode 100644 index 0000000..6f89212 --- /dev/null +++ b/src/symcipher/poly1305_i15.c @@ -0,0 +1,221 @@ +/* + * Copyright (c) 2017 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* + * This is a "reference" implementation of Poly1305 that uses the + * generic "i15" code for big integers. It is slow, but it handles all + * big-integer operations with generic code, thereby avoiding most + * tricky situations with carry propagation and modular reduction. + */ + +/* + * Modulus: 2^130-5. + */ +static const uint16_t P1305[] = { + 0x008A, + 0x7FFB, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x03FF +}; + +/* + * -p mod 2^15. + */ +#define P0I 0x4CCD + +/* + * R^2 mod p, for conversion to Montgomery representation (R = 2^135, + * since we use 9 words of 15 bits each, and 15*9 = 135). + */ +static const uint16_t R2[] = { + 0x008A, + 0x6400, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 +}; + +/* + * Perform the inner processing of blocks for Poly1305. The "r" array + * is in Montgomery representation, while the "a" array is not. + */ +static void +poly1305_inner(uint16_t *a, const uint16_t *r, const void *data, size_t len) +{ + const unsigned char *buf; + + buf = data; + while (len > 0) { + unsigned char tmp[16], rev[16]; + uint16_t b[10]; + uint32_t ctl; + int i; + + /* + * If there is a partial block, right-pad it with zeros. + */ + if (len < 16) { + memset(tmp, 0, sizeof tmp); + memcpy(tmp, buf, len); + buf = tmp; + len = 16; + } + + /* + * Decode next block and apply the "high bit". Since + * decoding is little-endian, we must byte-swap the buffer. + */ + for (i = 0; i < 16; i ++) { + rev[i] = buf[15 - i]; + } + br_i15_decode_mod(b, rev, sizeof rev, P1305); + b[9] |= 0x0100; + + /* + * Add the accumulator to the decoded block (modular + * addition). + */ + ctl = br_i15_add(b, a, 1); + ctl |= NOT(br_i15_sub(b, P1305, 0)); + br_i15_sub(b, P1305, ctl); + + /* + * Multiply by r, result is the new accumulator value. + */ + br_i15_montymul(a, b, r, P1305, P0I); + + buf += 16; + len -= 16; + } +} + +/* + * Byteswap a 16-byte value. + */ +static void +byteswap16(unsigned char *buf) +{ + int i; + + for (i = 0; i < 8; i ++) { + unsigned x; + + x = buf[i]; + buf[i] = buf[15 - i]; + buf[15 - i] = x; + } +} + +/* see bearssl_block.h */ +void +br_poly1305_i15_run(const void *key, const void *iv, + void *data, size_t len, const void *aad, size_t aad_len, + void *tag, br_chacha20_run ichacha, int encrypt) +{ + unsigned char pkey[32], foot[16]; + uint16_t t[10], r[10], acc[10]; + + /* + * Compute the MAC key. The 'r' value is the first 16 bytes of + * pkey[]. + */ + memset(pkey, 0, sizeof pkey); + ichacha(key, iv, 0, pkey, sizeof pkey); + + /* + * If encrypting, ChaCha20 must run first, followed by Poly1305. + * When decrypting, the operations are reversed. + */ + if (encrypt) { + ichacha(key, iv, 1, data, len); + } + + /* + * Run Poly1305. We must process the AAD, then ciphertext, then + * the footer (with the lengths). Note that the AAD and ciphertext + * are meant to be padded with zeros up to the next multiple of 16, + * and the length of the footer is 16 bytes as well. + */ + + /* + * Apply the "clamping" operation on the encoded 'r' value. + */ + pkey[ 3] &= 0x0F; + pkey[ 7] &= 0x0F; + pkey[11] &= 0x0F; + pkey[15] &= 0x0F; + pkey[ 4] &= 0xFC; + pkey[ 8] &= 0xFC; + pkey[12] &= 0xFC; + + /* + * Decode the clamped 'r' value. Decoding should use little-endian + * so we must byteswap the value first. + */ + byteswap16(pkey); + br_i15_decode_mod(t, pkey, 16, P1305); + + /* + * Convert 'r' to Montgomery representation. + */ + br_i15_montymul(r, t, R2, P1305, P0I); + + /* + * Accumulator is 0. + */ + br_i15_zero(acc, 0x8A); + + /* + * Process the additional authenticated data, ciphertext, and + * footer in due order. + */ + br_enc64le(foot, (uint64_t)aad_len); + br_enc64le(foot + 8, (uint64_t)len); + poly1305_inner(acc, r, aad, aad_len); + poly1305_inner(acc, r, data, len); + poly1305_inner(acc, r, foot, sizeof foot); + + /* + * Decode the value 's'. Again, a byteswap is needed. + */ + byteswap16(pkey + 16); + br_i15_decode_mod(t, pkey + 16, 16, P1305); + + /* + * Add the value 's' to the accumulator. That addition is done + * modulo 2^128, so we just ignore the carry. + */ + br_i15_add(acc, t, 1); + + /* + * Encode the result (128 low bits) to the tag. Encoding should + * be little-endian. + */ + br_i15_encode(tag, 16, acc); + byteswap16(tag); + + /* + * If decrypting, then ChaCha20 runs _after_ Poly1305. + */ + if (!encrypt) { + ichacha(key, iv, 1, data, len); + } +} diff --git a/test/test_crypto.c b/test/test_crypto.c index bed4927..b62ed3f 100644 --- a/test/test_crypto.c +++ b/test/test_crypto.c @@ -4113,11 +4113,13 @@ static const struct { }; static void -test_Poly1305_ctmul(void) +test_Poly1305_inner(const char *name, br_poly1305_run ipoly, + br_poly1305_run iref) { size_t u; + br_hmac_drbg_context rng; - printf("Test Poly1305_ctmul: "); + printf("Test %s: ", name); fflush(stdout); for (u = 0; KAT_POLY1305[u].skey; u ++) { @@ -4133,7 +4135,7 @@ test_Poly1305_ctmul(void) hextobin(tag, KAT_POLY1305[u].stag); memcpy(data, plain, len); - br_poly1305_ctmul_run(key, nonce, data, len, + ipoly(key, nonce, data, len, aad, aad_len, tmp, br_chacha20_ct_run, 1); if (memcmp(data, cipher, len) != 0) { fprintf(stderr, "ChaCha20+Poly1305 KAT failed (1)\n"); @@ -4143,7 +4145,7 @@ test_Poly1305_ctmul(void) fprintf(stderr, "ChaCha20+Poly1305 KAT failed (2)\n"); exit(EXIT_FAILURE); } - br_poly1305_ctmul_run(key, nonce, data, len, + ipoly(key, nonce, data, len, aad, aad_len, tmp, br_chacha20_ct_run, 0); if (memcmp(data, plain, len) != 0) { fprintf(stderr, "ChaCha20+Poly1305 KAT failed (3)\n"); @@ -4158,10 +4160,66 @@ test_Poly1305_ctmul(void) fflush(stdout); } + printf(" "); + fflush(stdout); + + /* + * We compare the "ipoly" and "iref" implementations together on + * a bunch of pseudo-random messages. + */ + br_hmac_drbg_init(&rng, &br_sha256_vtable, "seed for Poly1305", 17); + for (u = 0; u < 100; u ++) { + unsigned char plain[100], aad[100], tmp[100]; + unsigned char key[32], iv[12], tag1[16], tag2[16]; + + br_hmac_drbg_generate(&rng, key, sizeof key); + br_hmac_drbg_generate(&rng, iv, sizeof iv); + br_hmac_drbg_generate(&rng, plain, u); + br_hmac_drbg_generate(&rng, aad, u); + memcpy(tmp, plain, u); + memset(tmp + u, 0xFF, (sizeof tmp) - u); + ipoly(key, iv, tmp, u, aad, u, tag1, + &br_chacha20_ct_run, 1); + memset(tmp + u, 0x00, (sizeof tmp) - u); + iref(key, iv, tmp, u, aad, u, tag2, + &br_chacha20_ct_run, 0); + if (memcmp(tmp, plain, u) != 0) { + fprintf(stderr, "cross enc/dec failed\n"); + exit(EXIT_FAILURE); + } + if (memcmp(tag1, tag2, sizeof tag1) != 0) { + fprintf(stderr, "cross MAC failed\n"); + exit(EXIT_FAILURE); + } + printf("."); + fflush(stdout); + } + printf(" done.\n"); fflush(stdout); } +static void +test_Poly1305_ctmul(void) +{ + test_Poly1305_inner("Poly1305_ctmul", &br_poly1305_ctmul_run, + &br_poly1305_i15_run); +} + +static void +test_Poly1305_ctmul32(void) +{ + test_Poly1305_inner("Poly1305_ctmul32", &br_poly1305_ctmul32_run, + &br_poly1305_i15_run); +} + +static void +test_Poly1305_i15(void) +{ + test_Poly1305_inner("Poly1305_i15", &br_poly1305_i15_run, + &br_poly1305_ctmul_run); +} + /* * A 1024-bit RSA key, generated with OpenSSL. */ @@ -5380,6 +5438,8 @@ static const struct { STU(DES_ct), STU(ChaCha20_ct), STU(Poly1305_ctmul), + STU(Poly1305_ctmul32), + STU(Poly1305_i15), STU(RSA_i15), STU(RSA_i31), STU(RSA_i32), diff --git a/test/test_speed.c b/test/test_speed.c index 5e17714..a82eec7 100644 --- a/test/test_speed.c +++ b/test/test_speed.c @@ -347,6 +347,19 @@ test_speed_poly1305_ctmul(void) test_speed_poly1305_inner("Poly1305 (ctmul)", &br_poly1305_ctmul_run); } +static void +test_speed_poly1305_ctmul32(void) +{ + test_speed_poly1305_inner("Poly1305 (ctmul32)", + &br_poly1305_ctmul32_run); +} + +static void +test_speed_poly1305_i15(void) +{ + test_speed_poly1305_inner("Poly1305 (i15)", &br_poly1305_i15_run); +} + static const unsigned char RSA_N[] = { 0xE9, 0xF2, 0x4A, 0x2F, 0x96, 0xDF, 0x0A, 0x23, 0x01, 0x85, 0xF1, 0x2C, 0xB2, 0xA8, 0xEF, 0x23, @@ -1166,6 +1179,8 @@ static const struct { STU(ghash_ctmul64), STU(poly1305_ctmul), + STU(poly1305_ctmul32), + STU(poly1305_i15), STU(rsa_i15), STU(rsa_i31),