Extra Makefile hack for compatibility with OpenBSD 'make'.
[BearSSL] / src / symcipher / poly1305_ctmul32.c
1 /*
2 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining
5 * a copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "inner.h"
26
/*
 * Perform the inner processing of blocks for Poly1305.
 *
 *   a      accumulator (ten words, least significant first); it is
 *          updated in place
 *   r      multiplier in extended form (19 words): the caller stores
 *          the ten 13-bit words of 'r' in r[9..18], and five times
 *          r[10..18] in r[0..8]
 *   data   input bytes
 *   len    input length, in bytes
 *
 * Input is consumed by chunks of 16 bytes. A final partial chunk is
 * right-padded with zeros up to 16 bytes before being processed as a
 * full chunk. If len is zero, the accumulator is left untouched.
 */
static void
poly1305_inner(uint32_t *a, const uint32_t *r, const void *data, size_t len)
{
	/*
	 * Implementation notes: we split the 130-bit values into ten
	 * 13-bit words. This gives us some space for carries and allows
	 * using only 32x32->32 multiplications, which are way faster than
	 * 32x32->64 multiplications on the ARM Cortex-M0/M0+, and also
	 * help in making constant-time code on the Cortex-M3.
	 *
	 * Since we compute modulo 2^130-5, the "upper words" become
	 * low words with a factor of 5; that is, x*2^130 = x*5 mod p.
	 * This has already been integrated in the r[] array, which
	 * is extended to the 0..18 range.
	 *
	 * In each loop iteration, a[] and r[] words are 13-bit each,
	 * except a[1] which may use 14 bits.
	 */
	const unsigned char *buf;

	buf = data;
	while (len > 0) {
		unsigned char tmp[16];
		uint32_t b[10];
		unsigned u;
		uint32_t v, z, cc1, cc2;

		/*
		 * If there is a partial block, right-pad it with zeros.
		 */
		if (len < 16) {
			memset(tmp, 0, sizeof tmp);
			memcpy(tmp, buf, len);
			buf = tmp;
			len = 16;
		}

		/*
		 * Decode next block and apply the "high bit"; that value
		 * is added to the accumulator.
		 *
		 * The bit window 'v' holds up to 20 bits at a time, so it
		 * is a uint32_t, and each byte is converted to uint32_t
		 * before being shifted: plain 'unsigned' / 'int' are only
		 * guaranteed to offer 16 bits.
		 */
		v = br_dec16le(buf);
		a[0] += v & 0x01FFF;
		v >>= 13;
		v |= (uint32_t)buf[2] << 3;
		v |= (uint32_t)buf[3] << 11;
		a[1] += v & 0x01FFF;
		v >>= 13;
		v |= (uint32_t)buf[4] << 6;
		a[2] += v & 0x01FFF;
		v >>= 13;
		v |= (uint32_t)buf[5] << 1;
		v |= (uint32_t)buf[6] << 9;
		a[3] += v & 0x01FFF;
		v >>= 13;
		v |= (uint32_t)buf[7] << 4;
		v |= (uint32_t)buf[8] << 12;
		a[4] += v & 0x01FFF;
		v >>= 13;
		v |= (uint32_t)buf[9] << 7;
		a[5] += v & 0x01FFF;
		v >>= 13;
		v |= (uint32_t)buf[10] << 2;
		v |= (uint32_t)buf[11] << 10;
		a[6] += v & 0x01FFF;
		v >>= 13;
		v |= (uint32_t)buf[12] << 5;
		a[7] += v & 0x01FFF;
		v = br_dec16le(buf + 13);
		a[8] += v & 0x01FFF;
		v >>= 13;
		v |= (uint32_t)buf[15] << 3;
		/*
		 * Word 9 starts at bit 117, so 0x00800 (bit 11 of that
		 * word) is bit 128 of the block: the "high bit".
		 */
		a[9] += v | 0x00800;

		/*
		 * Now a[] contains words with values up to 14 bits each,
		 * and the ten words of 'r' fit on 13 bits, so each plain
		 * product fits on 27 bits. We perform the multiplication
		 * with r[].
		 *
		 * The extended words of r[] may be larger than 13 bits
		 * (they are 5 times a 13-bit word) so the full summation
		 * may yield values up to 46 times a 27-bit word, which
		 * does not fit on a 32-bit word. To avoid that issue, we
		 * split the summation in two loops of five products each
		 * (at most 25 times a 27-bit word, plus the incoming
		 * carry), with a carry propagation in each of them.
		 */
		cc1 = 0;
		for (u = 0; u < 10; u ++) {
			uint32_t s;

			s = cc1
				+ MUL15(a[0], r[u + 9 - 0])
				+ MUL15(a[1], r[u + 9 - 1])
				+ MUL15(a[2], r[u + 9 - 2])
				+ MUL15(a[3], r[u + 9 - 3])
				+ MUL15(a[4], r[u + 9 - 4]);
			b[u] = s & 0x1FFF;
			cc1 = s >> 13;
		}
		cc2 = 0;
		for (u = 0; u < 10; u ++) {
			uint32_t s;

			s = b[u] + cc2
				+ MUL15(a[5], r[u + 9 - 5])
				+ MUL15(a[6], r[u + 9 - 6])
				+ MUL15(a[7], r[u + 9 - 7])
				+ MUL15(a[8], r[u + 9 - 8])
				+ MUL15(a[9], r[u + 9 - 9]);
			b[u] = s & 0x1FFF;
			cc2 = s >> 13;
		}
		memcpy(a, b, sizeof b);

		/*
		 * The two carries "loop back" with a factor of 5. We
		 * propagate them into a[0] and a[1]; a[1] is not
		 * normalised here, which is why it may exceed 13 bits
		 * on the next iteration.
		 */
		z = cc1 + cc2;
		z += (z << 2) + a[0];
		a[0] = z & 0x1FFF;
		a[1] += z >> 13;

		buf += 16;
		len -= 16;
	}
}
163
164 /* see bearssl_block.h */
165 void
166 br_poly1305_ctmul32_run(const void *key, const void *iv,
167 void *data, size_t len, const void *aad, size_t aad_len,
168 void *tag, br_chacha20_run ichacha, int encrypt)
169 {
170 unsigned char pkey[32], foot[16];
171 uint32_t z, r[19], acc[10], cc, ctl;
172 int i;
173
174 /*
175 * Compute the MAC key. The 'r' value is the first 16 bytes of
176 * pkey[].
177 */
178 memset(pkey, 0, sizeof pkey);
179 ichacha(key, iv, 0, pkey, sizeof pkey);
180
181 /*
182 * If encrypting, ChaCha20 must run first, followed by Poly1305.
183 * When decrypting, the operations are reversed.
184 */
185 if (encrypt) {
186 ichacha(key, iv, 1, data, len);
187 }
188
189 /*
190 * Run Poly1305. We must process the AAD, then ciphertext, then
191 * the footer (with the lengths). Note that the AAD and ciphertext
192 * are meant to be padded with zeros up to the next multiple of 16,
193 * and the length of the footer is 16 bytes as well.
194 */
195
196 /*
197 * Decode the 'r' value into 13-bit words, with the "clamping"
198 * operation applied.
199 */
200 z = br_dec32le(pkey) & 0x03FFFFFF;
201 r[9] = z & 0x1FFF;
202 r[10] = z >> 13;
203 z = (br_dec32le(pkey + 3) >> 2) & 0x03FFFF03;
204 r[11] = z & 0x1FFF;
205 r[12] = z >> 13;
206 z = (br_dec32le(pkey + 6) >> 4) & 0x03FFC0FF;
207 r[13] = z & 0x1FFF;
208 r[14] = z >> 13;
209 z = (br_dec32le(pkey + 9) >> 6) & 0x03F03FFF;
210 r[15] = z & 0x1FFF;
211 r[16] = z >> 13;
212 z = (br_dec32le(pkey + 12) >> 8) & 0x000FFFFF;
213 r[17] = z & 0x1FFF;
214 r[18] = z >> 13;
215
216 /*
217 * Extend r[] with the 5x factor pre-applied.
218 */
219 for (i = 0; i < 9; i ++) {
220 r[i] = MUL15(5, r[i + 10]);
221 }
222
223 /*
224 * Accumulator is 0.
225 */
226 memset(acc, 0, sizeof acc);
227
228 /*
229 * Process the additional authenticated data, ciphertext, and
230 * footer in due order.
231 */
232 br_enc64le(foot, (uint64_t)aad_len);
233 br_enc64le(foot + 8, (uint64_t)len);
234 poly1305_inner(acc, r, aad, aad_len);
235 poly1305_inner(acc, r, data, len);
236 poly1305_inner(acc, r, foot, sizeof foot);
237
238 /*
239 * Finalise modular reduction. This is done with carry propagation
240 * and applying the '2^130 = -5 mod p' rule. Note that the output
241 * of poly1035_inner() is already mostly reduced, since only
242 * acc[1] may be (very slightly) above 2^13. A single loop back
243 * to acc[1] will be enough to make the value fit in 130 bits.
244 */
245 cc = 0;
246 for (i = 1; i < 10; i ++) {
247 z = acc[i] + cc;
248 acc[i] = z & 0x1FFF;
249 cc = z >> 13;
250 }
251 z = acc[0] + cc + (cc << 2);
252 acc[0] = z & 0x1FFF;
253 acc[1] += z >> 13;
254
255 /*
256 * We may still have a value in the 2^130-5..2^130-1 range, in
257 * which case we must reduce it again. The code below selects,
258 * in constant-time, between 'acc' and 'acc-p',
259 */
260 ctl = GT(acc[0], 0x1FFA);
261 for (i = 1; i < 10; i ++) {
262 ctl &= EQ(acc[i], 0x1FFF);
263 }
264 acc[0] = MUX(ctl, acc[0] - 0x1FFB, acc[0]);
265 for (i = 1; i < 10; i ++) {
266 acc[i] &= ~(-ctl);
267 }
268
269 /*
270 * Convert back the accumulator to 32-bit words, and add the
271 * 's' value (second half of pkey[]). That addition is done
272 * modulo 2^128.
273 */
274 z = acc[0] + (acc[1] << 13) + br_dec16le(pkey + 16);
275 br_enc16le((unsigned char *)tag, z & 0xFFFF);
276 z = (z >> 16) + (acc[2] << 10) + br_dec16le(pkey + 18);
277 br_enc16le((unsigned char *)tag + 2, z & 0xFFFF);
278 z = (z >> 16) + (acc[3] << 7) + br_dec16le(pkey + 20);
279 br_enc16le((unsigned char *)tag + 4, z & 0xFFFF);
280 z = (z >> 16) + (acc[4] << 4) + br_dec16le(pkey + 22);
281 br_enc16le((unsigned char *)tag + 6, z & 0xFFFF);
282 z = (z >> 16) + (acc[5] << 1) + (acc[6] << 14) + br_dec16le(pkey + 24);
283 br_enc16le((unsigned char *)tag + 8, z & 0xFFFF);
284 z = (z >> 16) + (acc[7] << 11) + br_dec16le(pkey + 26);
285 br_enc16le((unsigned char *)tag + 10, z & 0xFFFF);
286 z = (z >> 16) + (acc[8] << 8) + br_dec16le(pkey + 28);
287 br_enc16le((unsigned char *)tag + 12, z & 0xFFFF);
288 z = (z >> 16) + (acc[9] << 5) + br_dec16le(pkey + 30);
289 br_enc16le((unsigned char *)tag + 14, z & 0xFFFF);
290
291 /*
292 * If decrypting, then ChaCha20 runs _after_ Poly1305.
293 */
294 if (!encrypt) {
295 ichacha(key, iv, 1, data, len);
296 }
297 }