2 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
4 * Permission is hereby granted, free of charge, to any person obtaining
5 * a copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28 * Perform the inner processing of blocks for Poly1305.
31 poly1305_inner(uint32_t *a
, const uint32_t *r
, const void *data
, size_t len
)
34 * Implementation notes: we split the 130-bit values into ten
35 * 13-bit words. This gives us some space for carries and allows
36 * using only 32x32->32 multiplications, which are way faster than
37 * 32x32->64 multiplications on the ARM Cortex-M0/M0+, and also
38 * help in making constant-time code on the Cortex-M3.
40 * Since we compute modulo 2^130-5, the "upper words" become
41 * low words with a factor of 5; that is, x*2^130 = x*5 mod p.
42 * This has already been integrated in the r[] array, which
43 * is extended to the 0..18 range.
45 * In each loop iteration, a[] and r[] words are 13-bit each,
46 * except a[1] which may use 14 bits.
48 const unsigned char *buf
;
52 unsigned char tmp
[16];
58 * If there is a partial block, right-pad it with zeros.
61 memset(tmp
, 0, sizeof tmp
);
62 memcpy(tmp
, buf
, len
);
68 * Decode next block and apply the "high bit"; that value
69 * is added to the accumulator.
98 v
= br_dec16le(buf
+ 13);
105 * At that point, all a[] values fit on 14 bits, while
106 * all r[] values fit on 13 bits. Thus products fit on
107 * 27 bits, and we can accumulate up to 31 of them in
108 * a 32-bit word and still have some room for carries.
112 * Now a[] contains words with values up to 14 bits each.
113 * We perform the multiplication with r[].
115 * The extended words of r[] may be larger than 13 bits
116 * (they are 5 times a 13-bit word) so the full summation
117 * may yield values up to 46 times a 27-bit word, which
118 * does not fit on a 32-bit word. To avoid that issue, we
119 * must split the loop below in two, with a carry
120 * propagation operation in the middle.
123 for (u
= 0; u
< 10; u
++) {
127 + MUL15(a
[0], r
[u
+ 9 - 0])
128 + MUL15(a
[1], r
[u
+ 9 - 1])
129 + MUL15(a
[2], r
[u
+ 9 - 2])
130 + MUL15(a
[3], r
[u
+ 9 - 3])
131 + MUL15(a
[4], r
[u
+ 9 - 4]);
136 for (u
= 0; u
< 10; u
++) {
140 + MUL15(a
[5], r
[u
+ 9 - 5])
141 + MUL15(a
[6], r
[u
+ 9 - 6])
142 + MUL15(a
[7], r
[u
+ 9 - 7])
143 + MUL15(a
[8], r
[u
+ 9 - 8])
144 + MUL15(a
[9], r
[u
+ 9 - 9]);
148 memcpy(a
, b
, sizeof b
);
151 * The two carries "loop back" with a factor of 5. We
152 * propagate them into a[0] and a[1].
155 z
+= (z
<< 2) + a
[0];
164 /* see bearssl_block.h */
166 br_poly1305_ctmul32_run(const void *key
, const void *iv
,
167 void *data
, size_t len
, const void *aad
, size_t aad_len
,
168 void *tag
, br_chacha20_run ichacha
, int encrypt
)
170 unsigned char pkey
[32], foot
[16];
171 uint32_t z
, r
[19], acc
[10], cc
, ctl
;
175 * Compute the MAC key. The 'r' value is the first 16 bytes of pkey[].
178 memset(pkey
, 0, sizeof pkey
);
179 ichacha(key
, iv
, 0, pkey
, sizeof pkey
);
182 * If encrypting, ChaCha20 must run first, followed by Poly1305.
183 * When decrypting, the operations are reversed.
186 ichacha(key
, iv
, 1, data
, len
);
190 * Run Poly1305. We must process the AAD, then ciphertext, then
191 * the footer (with the lengths). Note that the AAD and ciphertext
192 * are meant to be padded with zeros up to the next multiple of 16,
193 * and the length of the footer is 16 bytes as well.
197 * Decode the 'r' value into 13-bit words, with the "clamping" operation applied.
200 z
= br_dec32le(pkey
) & 0x03FFFFFF;
203 z
= (br_dec32le(pkey
+ 3) >> 2) & 0x03FFFF03;
206 z
= (br_dec32le(pkey
+ 6) >> 4) & 0x03FFC0FF;
209 z
= (br_dec32le(pkey
+ 9) >> 6) & 0x03F03FFF;
212 z
= (br_dec32le(pkey
+ 12) >> 8) & 0x000FFFFF;
217 * Extend r[] with the 5x factor pre-applied.
219 for (i
= 0; i
< 9; i
++) {
220 r
[i
] = MUL15(5, r
[i
+ 10]);
226 memset(acc
, 0, sizeof acc
);
229 * Process the additional authenticated data, ciphertext, and
230 * footer in due order.
232 br_enc64le(foot
, (uint64_t)aad_len
);
233 br_enc64le(foot
+ 8, (uint64_t)len
);
234 poly1305_inner(acc
, r
, aad
, aad_len
);
235 poly1305_inner(acc
, r
, data
, len
);
236 poly1305_inner(acc
, r
, foot
, sizeof foot
);
239 * Finalise modular reduction. This is done with carry propagation
240 * and applying the '2^130 = -5 mod p' rule. Note that the output
241 * of poly1305_inner() is already mostly reduced, since only
242 * acc[1] may be (very slightly) above 2^13. A single loop back
243 * to acc[1] will be enough to make the value fit in 130 bits.
246 for (i
= 1; i
< 10; i
++) {
251 z
= acc
[0] + cc
+ (cc
<< 2);
256 * We may still have a value in the 2^130-5..2^130-1 range, in
257 * which case we must reduce it again. The code below selects,
258 * in constant-time, between 'acc' and 'acc-p'.
260 ctl
= GT(acc
[0], 0x1FFA);
261 for (i
= 1; i
< 10; i
++) {
262 ctl
&= EQ(acc
[i
], 0x1FFF);
264 acc
[0] = MUX(ctl
, acc
[0] - 0x1FFB, acc
[0]);
265 for (i
= 1; i
< 10; i
++) {
270 * Convert back the accumulator to 32-bit words, and add the
271 * 's' value (second half of pkey[]). That addition is done modulo 2^128.
274 z
= acc
[0] + (acc
[1] << 13) + br_dec16le(pkey
+ 16);
275 br_enc16le((unsigned char *)tag
, z
& 0xFFFF);
276 z
= (z
>> 16) + (acc
[2] << 10) + br_dec16le(pkey
+ 18);
277 br_enc16le((unsigned char *)tag
+ 2, z
& 0xFFFF);
278 z
= (z
>> 16) + (acc
[3] << 7) + br_dec16le(pkey
+ 20);
279 br_enc16le((unsigned char *)tag
+ 4, z
& 0xFFFF);
280 z
= (z
>> 16) + (acc
[4] << 4) + br_dec16le(pkey
+ 22);
281 br_enc16le((unsigned char *)tag
+ 6, z
& 0xFFFF);
282 z
= (z
>> 16) + (acc
[5] << 1) + (acc
[6] << 14) + br_dec16le(pkey
+ 24);
283 br_enc16le((unsigned char *)tag
+ 8, z
& 0xFFFF);
284 z
= (z
>> 16) + (acc
[7] << 11) + br_dec16le(pkey
+ 26);
285 br_enc16le((unsigned char *)tag
+ 10, z
& 0xFFFF);
286 z
= (z
>> 16) + (acc
[8] << 8) + br_dec16le(pkey
+ 28);
287 br_enc16le((unsigned char *)tag
+ 12, z
& 0xFFFF);
288 z
= (z
>> 16) + (acc
[9] << 5) + br_dec16le(pkey
+ 30);
289 br_enc16le((unsigned char *)tag
+ 14, z
& 0xFFFF);
292 * If decrypting, then ChaCha20 runs _after_ Poly1305.
295 ichacha(key
, iv
, 1, data
, len
);