98865ceaf940a1544bba390050a192f18fbec48b
[BearSSL] / src / ec / ec_p256_i15.c
1 /*
2 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining
5 * a copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "inner.h"
26
27 /*
28 * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
29 * that right-shifting a signed negative integer copies the sign bit
30 * (arithmetic right-shift). This is "implementation-defined behaviour",
31 * i.e. it is not undefined, but it may differ between compilers. Each
32 * compiler is supposed to document its behaviour in that respect. GCC
33 * explicitly defines that an arithmetic right shift is used. We expect
34 * all other compilers to do the same, because underlying CPU offer an
35 * arithmetic right shift opcode that could not be used otherwise.
36 */
37 #if BR_NO_ARITH_SHIFT
38 #define ARSH(x, n) (((uint32_t)(x) >> (n)) \
39 | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
40 #else
41 #define ARSH(x, n) ((*(int32_t *)&(x)) >> (n))
42 #endif
43
44 /*
45 * Convert an integer from unsigned big-endian encoding to a sequence of
46 * 13-bit words in little-endian order. The final "partial" word is
47 * returned.
48 */
49 static uint32_t
50 be8_to_le13(uint32_t *dst, const unsigned char *src, size_t len)
51 {
52 uint32_t acc;
53 int acc_len;
54
55 acc = 0;
56 acc_len = 0;
57 while (len -- > 0) {
58 acc |= (uint32_t)src[len] << acc_len;
59 acc_len += 8;
60 if (acc_len >= 13) {
61 *dst ++ = acc & 0x1FFF;
62 acc >>= 13;
63 acc_len -= 13;
64 }
65 }
66 return acc;
67 }
68
69 /*
70 * Convert an integer (13-bit words, little-endian) to unsigned
71 * big-endian encoding. The total encoding length is provided; all
72 * the destination bytes will be filled.
73 */
74 static void
75 le13_to_be8(unsigned char *dst, size_t len, const uint32_t *src)
76 {
77 uint32_t acc;
78 int acc_len;
79
80 acc = 0;
81 acc_len = 0;
82 while (len -- > 0) {
83 if (acc_len < 8) {
84 acc |= (*src ++) << acc_len;
85 acc_len += 13;
86 }
87 dst[len] = (unsigned char)acc;
88 acc >>= 8;
89 acc_len -= 8;
90 }
91 }
92
93 /*
94 * Normalise an array of words to a strict 13 bits per word. Returned
95 * value is the resulting carry. The source (w) and destination (d)
96 * arrays may be identical, but shall not overlap partially.
97 */
98 static inline uint32_t
99 norm13(uint32_t *d, const uint32_t *w, size_t len)
100 {
101 size_t u;
102 uint32_t cc;
103
104 cc = 0;
105 for (u = 0; u < len; u ++) {
106 int32_t z;
107
108 z = w[u] + cc;
109 d[u] = z & 0x1FFF;
110 cc = ARSH(z, 13);
111 }
112 return cc;
113 }
114
115 /*
116 * mul20() multiplies two 260-bit integers together. Each word must fit
117 * on 13 bits; source operands use 20 words, destination operand
118 * receives 40 words. All overlaps allowed.
119 *
120 *
121 */
122
123 #if BR_SLOW_MUL15
124
125 static void
126 mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
127 {
128 /*
129 * Two-level Karatsuba: turns a 20x20 multiplication into
130 * nine 5x5 multiplications. We use 13-bit words but do not
131 * propagate carries immediately, so words may expand:
132 *
133 * - First Karatsuba decomposition turns the 20x20 mul on
134 * 13-bit words into three 10x10 muls, two on 13-bit words
135 * and one on 14-bit words.
136 *
137 * - Second Karatsuba decomposition further splits these into:
138 *
139 * * four 5x5 muls on 13-bit words
140 * * four 5x5 muls on 14-bit words
141 * * one 5x5 mul on 15-bit words
142 *
143 * Highest word value is 8191, 16382 or 32764, for 13-bit, 14-bit
144 * or 15-bit words, respectively.
145 */
146 uint32_t u[45], v[45], w[90];
147 uint32_t cc;
148 int i;
149
150 #define ZADD(dw, d_off, s1w, s1_off, s2w, s2_off) do { \
151 (dw)[5 * (d_off) + 0] = (s1w)[5 * (s1_off) + 0] \
152 + (s2w)[5 * (s2_off) + 0]; \
153 (dw)[5 * (d_off) + 1] = (s1w)[5 * (s1_off) + 1] \
154 + (s2w)[5 * (s2_off) + 1]; \
155 (dw)[5 * (d_off) + 2] = (s1w)[5 * (s1_off) + 2] \
156 + (s2w)[5 * (s2_off) + 2]; \
157 (dw)[5 * (d_off) + 3] = (s1w)[5 * (s1_off) + 3] \
158 + (s2w)[5 * (s2_off) + 3]; \
159 (dw)[5 * (d_off) + 4] = (s1w)[5 * (s1_off) + 4] \
160 + (s2w)[5 * (s2_off) + 4]; \
161 } while (0)
162
163 #define ZADDT(dw, d_off, sw, s_off) do { \
164 (dw)[5 * (d_off) + 0] += (sw)[5 * (s_off) + 0]; \
165 (dw)[5 * (d_off) + 1] += (sw)[5 * (s_off) + 1]; \
166 (dw)[5 * (d_off) + 2] += (sw)[5 * (s_off) + 2]; \
167 (dw)[5 * (d_off) + 3] += (sw)[5 * (s_off) + 3]; \
168 (dw)[5 * (d_off) + 4] += (sw)[5 * (s_off) + 4]; \
169 } while (0)
170
171 #define ZSUB2F(dw, d_off, s1w, s1_off, s2w, s2_off) do { \
172 (dw)[5 * (d_off) + 0] -= (s1w)[5 * (s1_off) + 0] \
173 + (s2w)[5 * (s2_off) + 0]; \
174 (dw)[5 * (d_off) + 1] -= (s1w)[5 * (s1_off) + 1] \
175 + (s2w)[5 * (s2_off) + 1]; \
176 (dw)[5 * (d_off) + 2] -= (s1w)[5 * (s1_off) + 2] \
177 + (s2w)[5 * (s2_off) + 2]; \
178 (dw)[5 * (d_off) + 3] -= (s1w)[5 * (s1_off) + 3] \
179 + (s2w)[5 * (s2_off) + 3]; \
180 (dw)[5 * (d_off) + 4] -= (s1w)[5 * (s1_off) + 4] \
181 + (s2w)[5 * (s2_off) + 4]; \
182 } while (0)
183
184 #define CPR1(w, cprcc) do { \
185 uint32_t cprz = (w) + cprcc; \
186 (w) = cprz & 0x1FFF; \
187 cprcc = cprz >> 13; \
188 } while (0)
189
190 #define CPR(dw, d_off) do { \
191 uint32_t cprcc; \
192 cprcc = 0; \
193 CPR1((dw)[(d_off) + 0], cprcc); \
194 CPR1((dw)[(d_off) + 1], cprcc); \
195 CPR1((dw)[(d_off) + 2], cprcc); \
196 CPR1((dw)[(d_off) + 3], cprcc); \
197 CPR1((dw)[(d_off) + 4], cprcc); \
198 CPR1((dw)[(d_off) + 5], cprcc); \
199 CPR1((dw)[(d_off) + 6], cprcc); \
200 CPR1((dw)[(d_off) + 7], cprcc); \
201 CPR1((dw)[(d_off) + 8], cprcc); \
202 (dw)[(d_off) + 9] = cprcc; \
203 } while (0)
204
205 memcpy(u, a, 20 * sizeof *a);
206 ZADD(u, 4, a, 0, a, 1);
207 ZADD(u, 5, a, 2, a, 3);
208 ZADD(u, 6, a, 0, a, 2);
209 ZADD(u, 7, a, 1, a, 3);
210 ZADD(u, 8, u, 6, u, 7);
211
212 memcpy(v, b, 20 * sizeof *b);
213 ZADD(v, 4, b, 0, b, 1);
214 ZADD(v, 5, b, 2, b, 3);
215 ZADD(v, 6, b, 0, b, 2);
216 ZADD(v, 7, b, 1, b, 3);
217 ZADD(v, 8, v, 6, v, 7);
218
219 /*
220 * Do the eight first 8x8 muls. Source words are at most 16382
221 * each, so we can add product results together "as is" in 32-bit
222 * words.
223 */
224 for (i = 0; i < 40; i += 5) {
225 w[(i << 1) + 0] = MUL15(u[i + 0], v[i + 0]);
226 w[(i << 1) + 1] = MUL15(u[i + 0], v[i + 1])
227 + MUL15(u[i + 1], v[i + 0]);
228 w[(i << 1) + 2] = MUL15(u[i + 0], v[i + 2])
229 + MUL15(u[i + 1], v[i + 1])
230 + MUL15(u[i + 2], v[i + 0]);
231 w[(i << 1) + 3] = MUL15(u[i + 0], v[i + 3])
232 + MUL15(u[i + 1], v[i + 2])
233 + MUL15(u[i + 2], v[i + 1])
234 + MUL15(u[i + 3], v[i + 0]);
235 w[(i << 1) + 4] = MUL15(u[i + 0], v[i + 4])
236 + MUL15(u[i + 1], v[i + 3])
237 + MUL15(u[i + 2], v[i + 2])
238 + MUL15(u[i + 3], v[i + 1])
239 + MUL15(u[i + 4], v[i + 0]);
240 w[(i << 1) + 5] = MUL15(u[i + 1], v[i + 4])
241 + MUL15(u[i + 2], v[i + 3])
242 + MUL15(u[i + 3], v[i + 2])
243 + MUL15(u[i + 4], v[i + 1]);
244 w[(i << 1) + 6] = MUL15(u[i + 2], v[i + 4])
245 + MUL15(u[i + 3], v[i + 3])
246 + MUL15(u[i + 4], v[i + 2]);
247 w[(i << 1) + 7] = MUL15(u[i + 3], v[i + 4])
248 + MUL15(u[i + 4], v[i + 3]);
249 w[(i << 1) + 8] = MUL15(u[i + 4], v[i + 4]);
250 w[(i << 1) + 9] = 0;
251 }
252
253 /*
254 * For the 9th multiplication, source words are up to 32764,
255 * so we must do some carry propagation. If we add up to
256 * 4 products and the carry is no more than 524224, then the
257 * result fits in 32 bits, and the next carry will be no more
258 * than 524224 (because 4*(32764^2)+524224 < 8192*524225).
259 *
260 * We thus just skip one of the products in the middle word,
261 * then do a carry propagation (this reduces words to 13 bits
262 * each, except possibly the last, which may use up to 17 bits
263 * or so), then add the missing product.
264 */
265 w[80 + 0] = MUL15(u[40 + 0], v[40 + 0]);
266 w[80 + 1] = MUL15(u[40 + 0], v[40 + 1])
267 + MUL15(u[40 + 1], v[40 + 0]);
268 w[80 + 2] = MUL15(u[40 + 0], v[40 + 2])
269 + MUL15(u[40 + 1], v[40 + 1])
270 + MUL15(u[40 + 2], v[40 + 0]);
271 w[80 + 3] = MUL15(u[40 + 0], v[40 + 3])
272 + MUL15(u[40 + 1], v[40 + 2])
273 + MUL15(u[40 + 2], v[40 + 1])
274 + MUL15(u[40 + 3], v[40 + 0]);
275 w[80 + 4] = MUL15(u[40 + 0], v[40 + 4])
276 + MUL15(u[40 + 1], v[40 + 3])
277 + MUL15(u[40 + 2], v[40 + 2])
278 + MUL15(u[40 + 3], v[40 + 1]);
279 /* + MUL15(u[40 + 4], v[40 + 0]) */
280 w[80 + 5] = MUL15(u[40 + 1], v[40 + 4])
281 + MUL15(u[40 + 2], v[40 + 3])
282 + MUL15(u[40 + 3], v[40 + 2])
283 + MUL15(u[40 + 4], v[40 + 1]);
284 w[80 + 6] = MUL15(u[40 + 2], v[40 + 4])
285 + MUL15(u[40 + 3], v[40 + 3])
286 + MUL15(u[40 + 4], v[40 + 2]);
287 w[80 + 7] = MUL15(u[40 + 3], v[40 + 4])
288 + MUL15(u[40 + 4], v[40 + 3]);
289 w[80 + 8] = MUL15(u[40 + 4], v[40 + 4]);
290
291 CPR(w, 80);
292
293 w[80 + 4] += MUL15(u[40 + 4], v[40 + 0]);
294
295 /*
296 * The products on 14-bit words in slots 6 and 7 yield values
297 * up to 5*(16382^2) each, and we need to subtract two such
298 * values from the higher word. We need the subtraction to fit
299 * in a _signed_ 32-bit integer, i.e. 31 bits + a sign bit.
300 * However, 10*(16382^2) does not fit. So we must perform a
301 * bit of reduction here.
302 */
303 CPR(w, 60);
304 CPR(w, 70);
305
306 /*
307 * Recompose results.
308 */
309
310 /* 0..1*0..1 into 0..3 */
311 ZSUB2F(w, 8, w, 0, w, 2);
312 ZSUB2F(w, 9, w, 1, w, 3);
313 ZADDT(w, 1, w, 8);
314 ZADDT(w, 2, w, 9);
315
316 /* 2..3*2..3 into 4..7 */
317 ZSUB2F(w, 10, w, 4, w, 6);
318 ZSUB2F(w, 11, w, 5, w, 7);
319 ZADDT(w, 5, w, 10);
320 ZADDT(w, 6, w, 11);
321
322 /* (0..1+2..3)*(0..1+2..3) into 12..15 */
323 ZSUB2F(w, 16, w, 12, w, 14);
324 ZSUB2F(w, 17, w, 13, w, 15);
325 ZADDT(w, 13, w, 16);
326 ZADDT(w, 14, w, 17);
327
328 /* first-level recomposition */
329 ZSUB2F(w, 12, w, 0, w, 4);
330 ZSUB2F(w, 13, w, 1, w, 5);
331 ZSUB2F(w, 14, w, 2, w, 6);
332 ZSUB2F(w, 15, w, 3, w, 7);
333 ZADDT(w, 2, w, 12);
334 ZADDT(w, 3, w, 13);
335 ZADDT(w, 4, w, 14);
336 ZADDT(w, 5, w, 15);
337
338 /*
339 * Perform carry propagation to bring all words down to 13 bits.
340 */
341 cc = norm13(d, w, 40);
342 d[39] += (cc << 13);
343
344 #undef ZADD
345 #undef ZADDT
346 #undef ZSUB2F
347 #undef CPR1
348 #undef CPR
349 }
350
351 #else
352
353 static void
354 mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
355 {
356 uint32_t t[39];
357
358 t[ 0] = MUL15(a[ 0], b[ 0]);
359 t[ 1] = MUL15(a[ 0], b[ 1])
360 + MUL15(a[ 1], b[ 0]);
361 t[ 2] = MUL15(a[ 0], b[ 2])
362 + MUL15(a[ 1], b[ 1])
363 + MUL15(a[ 2], b[ 0]);
364 t[ 3] = MUL15(a[ 0], b[ 3])
365 + MUL15(a[ 1], b[ 2])
366 + MUL15(a[ 2], b[ 1])
367 + MUL15(a[ 3], b[ 0]);
368 t[ 4] = MUL15(a[ 0], b[ 4])
369 + MUL15(a[ 1], b[ 3])
370 + MUL15(a[ 2], b[ 2])
371 + MUL15(a[ 3], b[ 1])
372 + MUL15(a[ 4], b[ 0]);
373 t[ 5] = MUL15(a[ 0], b[ 5])
374 + MUL15(a[ 1], b[ 4])
375 + MUL15(a[ 2], b[ 3])
376 + MUL15(a[ 3], b[ 2])
377 + MUL15(a[ 4], b[ 1])
378 + MUL15(a[ 5], b[ 0]);
379 t[ 6] = MUL15(a[ 0], b[ 6])
380 + MUL15(a[ 1], b[ 5])
381 + MUL15(a[ 2], b[ 4])
382 + MUL15(a[ 3], b[ 3])
383 + MUL15(a[ 4], b[ 2])
384 + MUL15(a[ 5], b[ 1])
385 + MUL15(a[ 6], b[ 0]);
386 t[ 7] = MUL15(a[ 0], b[ 7])
387 + MUL15(a[ 1], b[ 6])
388 + MUL15(a[ 2], b[ 5])
389 + MUL15(a[ 3], b[ 4])
390 + MUL15(a[ 4], b[ 3])
391 + MUL15(a[ 5], b[ 2])
392 + MUL15(a[ 6], b[ 1])
393 + MUL15(a[ 7], b[ 0]);
394 t[ 8] = MUL15(a[ 0], b[ 8])
395 + MUL15(a[ 1], b[ 7])
396 + MUL15(a[ 2], b[ 6])
397 + MUL15(a[ 3], b[ 5])
398 + MUL15(a[ 4], b[ 4])
399 + MUL15(a[ 5], b[ 3])
400 + MUL15(a[ 6], b[ 2])
401 + MUL15(a[ 7], b[ 1])
402 + MUL15(a[ 8], b[ 0]);
403 t[ 9] = MUL15(a[ 0], b[ 9])
404 + MUL15(a[ 1], b[ 8])
405 + MUL15(a[ 2], b[ 7])
406 + MUL15(a[ 3], b[ 6])
407 + MUL15(a[ 4], b[ 5])
408 + MUL15(a[ 5], b[ 4])
409 + MUL15(a[ 6], b[ 3])
410 + MUL15(a[ 7], b[ 2])
411 + MUL15(a[ 8], b[ 1])
412 + MUL15(a[ 9], b[ 0]);
413 t[10] = MUL15(a[ 0], b[10])
414 + MUL15(a[ 1], b[ 9])
415 + MUL15(a[ 2], b[ 8])
416 + MUL15(a[ 3], b[ 7])
417 + MUL15(a[ 4], b[ 6])
418 + MUL15(a[ 5], b[ 5])
419 + MUL15(a[ 6], b[ 4])
420 + MUL15(a[ 7], b[ 3])
421 + MUL15(a[ 8], b[ 2])
422 + MUL15(a[ 9], b[ 1])
423 + MUL15(a[10], b[ 0]);
424 t[11] = MUL15(a[ 0], b[11])
425 + MUL15(a[ 1], b[10])
426 + MUL15(a[ 2], b[ 9])
427 + MUL15(a[ 3], b[ 8])
428 + MUL15(a[ 4], b[ 7])
429 + MUL15(a[ 5], b[ 6])
430 + MUL15(a[ 6], b[ 5])
431 + MUL15(a[ 7], b[ 4])
432 + MUL15(a[ 8], b[ 3])
433 + MUL15(a[ 9], b[ 2])
434 + MUL15(a[10], b[ 1])
435 + MUL15(a[11], b[ 0]);
436 t[12] = MUL15(a[ 0], b[12])
437 + MUL15(a[ 1], b[11])
438 + MUL15(a[ 2], b[10])
439 + MUL15(a[ 3], b[ 9])
440 + MUL15(a[ 4], b[ 8])
441 + MUL15(a[ 5], b[ 7])
442 + MUL15(a[ 6], b[ 6])
443 + MUL15(a[ 7], b[ 5])
444 + MUL15(a[ 8], b[ 4])
445 + MUL15(a[ 9], b[ 3])
446 + MUL15(a[10], b[ 2])
447 + MUL15(a[11], b[ 1])
448 + MUL15(a[12], b[ 0]);
449 t[13] = MUL15(a[ 0], b[13])
450 + MUL15(a[ 1], b[12])
451 + MUL15(a[ 2], b[11])
452 + MUL15(a[ 3], b[10])
453 + MUL15(a[ 4], b[ 9])
454 + MUL15(a[ 5], b[ 8])
455 + MUL15(a[ 6], b[ 7])
456 + MUL15(a[ 7], b[ 6])
457 + MUL15(a[ 8], b[ 5])
458 + MUL15(a[ 9], b[ 4])
459 + MUL15(a[10], b[ 3])
460 + MUL15(a[11], b[ 2])
461 + MUL15(a[12], b[ 1])
462 + MUL15(a[13], b[ 0]);
463 t[14] = MUL15(a[ 0], b[14])
464 + MUL15(a[ 1], b[13])
465 + MUL15(a[ 2], b[12])
466 + MUL15(a[ 3], b[11])
467 + MUL15(a[ 4], b[10])
468 + MUL15(a[ 5], b[ 9])
469 + MUL15(a[ 6], b[ 8])
470 + MUL15(a[ 7], b[ 7])
471 + MUL15(a[ 8], b[ 6])
472 + MUL15(a[ 9], b[ 5])
473 + MUL15(a[10], b[ 4])
474 + MUL15(a[11], b[ 3])
475 + MUL15(a[12], b[ 2])
476 + MUL15(a[13], b[ 1])
477 + MUL15(a[14], b[ 0]);
478 t[15] = MUL15(a[ 0], b[15])
479 + MUL15(a[ 1], b[14])
480 + MUL15(a[ 2], b[13])
481 + MUL15(a[ 3], b[12])
482 + MUL15(a[ 4], b[11])
483 + MUL15(a[ 5], b[10])
484 + MUL15(a[ 6], b[ 9])
485 + MUL15(a[ 7], b[ 8])
486 + MUL15(a[ 8], b[ 7])
487 + MUL15(a[ 9], b[ 6])
488 + MUL15(a[10], b[ 5])
489 + MUL15(a[11], b[ 4])
490 + MUL15(a[12], b[ 3])
491 + MUL15(a[13], b[ 2])
492 + MUL15(a[14], b[ 1])
493 + MUL15(a[15], b[ 0]);
494 t[16] = MUL15(a[ 0], b[16])
495 + MUL15(a[ 1], b[15])
496 + MUL15(a[ 2], b[14])
497 + MUL15(a[ 3], b[13])
498 + MUL15(a[ 4], b[12])
499 + MUL15(a[ 5], b[11])
500 + MUL15(a[ 6], b[10])
501 + MUL15(a[ 7], b[ 9])
502 + MUL15(a[ 8], b[ 8])
503 + MUL15(a[ 9], b[ 7])
504 + MUL15(a[10], b[ 6])
505 + MUL15(a[11], b[ 5])
506 + MUL15(a[12], b[ 4])
507 + MUL15(a[13], b[ 3])
508 + MUL15(a[14], b[ 2])
509 + MUL15(a[15], b[ 1])
510 + MUL15(a[16], b[ 0]);
511 t[17] = MUL15(a[ 0], b[17])
512 + MUL15(a[ 1], b[16])
513 + MUL15(a[ 2], b[15])
514 + MUL15(a[ 3], b[14])
515 + MUL15(a[ 4], b[13])
516 + MUL15(a[ 5], b[12])
517 + MUL15(a[ 6], b[11])
518 + MUL15(a[ 7], b[10])
519 + MUL15(a[ 8], b[ 9])
520 + MUL15(a[ 9], b[ 8])
521 + MUL15(a[10], b[ 7])
522 + MUL15(a[11], b[ 6])
523 + MUL15(a[12], b[ 5])
524 + MUL15(a[13], b[ 4])
525 + MUL15(a[14], b[ 3])
526 + MUL15(a[15], b[ 2])
527 + MUL15(a[16], b[ 1])
528 + MUL15(a[17], b[ 0]);
529 t[18] = MUL15(a[ 0], b[18])
530 + MUL15(a[ 1], b[17])
531 + MUL15(a[ 2], b[16])
532 + MUL15(a[ 3], b[15])
533 + MUL15(a[ 4], b[14])
534 + MUL15(a[ 5], b[13])
535 + MUL15(a[ 6], b[12])
536 + MUL15(a[ 7], b[11])
537 + MUL15(a[ 8], b[10])
538 + MUL15(a[ 9], b[ 9])
539 + MUL15(a[10], b[ 8])
540 + MUL15(a[11], b[ 7])
541 + MUL15(a[12], b[ 6])
542 + MUL15(a[13], b[ 5])
543 + MUL15(a[14], b[ 4])
544 + MUL15(a[15], b[ 3])
545 + MUL15(a[16], b[ 2])
546 + MUL15(a[17], b[ 1])
547 + MUL15(a[18], b[ 0]);
548 t[19] = MUL15(a[ 0], b[19])
549 + MUL15(a[ 1], b[18])
550 + MUL15(a[ 2], b[17])
551 + MUL15(a[ 3], b[16])
552 + MUL15(a[ 4], b[15])
553 + MUL15(a[ 5], b[14])
554 + MUL15(a[ 6], b[13])
555 + MUL15(a[ 7], b[12])
556 + MUL15(a[ 8], b[11])
557 + MUL15(a[ 9], b[10])
558 + MUL15(a[10], b[ 9])
559 + MUL15(a[11], b[ 8])
560 + MUL15(a[12], b[ 7])
561 + MUL15(a[13], b[ 6])
562 + MUL15(a[14], b[ 5])
563 + MUL15(a[15], b[ 4])
564 + MUL15(a[16], b[ 3])
565 + MUL15(a[17], b[ 2])
566 + MUL15(a[18], b[ 1])
567 + MUL15(a[19], b[ 0]);
568 t[20] = MUL15(a[ 1], b[19])
569 + MUL15(a[ 2], b[18])
570 + MUL15(a[ 3], b[17])
571 + MUL15(a[ 4], b[16])
572 + MUL15(a[ 5], b[15])
573 + MUL15(a[ 6], b[14])
574 + MUL15(a[ 7], b[13])
575 + MUL15(a[ 8], b[12])
576 + MUL15(a[ 9], b[11])
577 + MUL15(a[10], b[10])
578 + MUL15(a[11], b[ 9])
579 + MUL15(a[12], b[ 8])
580 + MUL15(a[13], b[ 7])
581 + MUL15(a[14], b[ 6])
582 + MUL15(a[15], b[ 5])
583 + MUL15(a[16], b[ 4])
584 + MUL15(a[17], b[ 3])
585 + MUL15(a[18], b[ 2])
586 + MUL15(a[19], b[ 1]);
587 t[21] = MUL15(a[ 2], b[19])
588 + MUL15(a[ 3], b[18])
589 + MUL15(a[ 4], b[17])
590 + MUL15(a[ 5], b[16])
591 + MUL15(a[ 6], b[15])
592 + MUL15(a[ 7], b[14])
593 + MUL15(a[ 8], b[13])
594 + MUL15(a[ 9], b[12])
595 + MUL15(a[10], b[11])
596 + MUL15(a[11], b[10])
597 + MUL15(a[12], b[ 9])
598 + MUL15(a[13], b[ 8])
599 + MUL15(a[14], b[ 7])
600 + MUL15(a[15], b[ 6])
601 + MUL15(a[16], b[ 5])
602 + MUL15(a[17], b[ 4])
603 + MUL15(a[18], b[ 3])
604 + MUL15(a[19], b[ 2]);
605 t[22] = MUL15(a[ 3], b[19])
606 + MUL15(a[ 4], b[18])
607 + MUL15(a[ 5], b[17])
608 + MUL15(a[ 6], b[16])
609 + MUL15(a[ 7], b[15])
610 + MUL15(a[ 8], b[14])
611 + MUL15(a[ 9], b[13])
612 + MUL15(a[10], b[12])
613 + MUL15(a[11], b[11])
614 + MUL15(a[12], b[10])
615 + MUL15(a[13], b[ 9])
616 + MUL15(a[14], b[ 8])
617 + MUL15(a[15], b[ 7])
618 + MUL15(a[16], b[ 6])
619 + MUL15(a[17], b[ 5])
620 + MUL15(a[18], b[ 4])
621 + MUL15(a[19], b[ 3]);
622 t[23] = MUL15(a[ 4], b[19])
623 + MUL15(a[ 5], b[18])
624 + MUL15(a[ 6], b[17])
625 + MUL15(a[ 7], b[16])
626 + MUL15(a[ 8], b[15])
627 + MUL15(a[ 9], b[14])
628 + MUL15(a[10], b[13])
629 + MUL15(a[11], b[12])
630 + MUL15(a[12], b[11])
631 + MUL15(a[13], b[10])
632 + MUL15(a[14], b[ 9])
633 + MUL15(a[15], b[ 8])
634 + MUL15(a[16], b[ 7])
635 + MUL15(a[17], b[ 6])
636 + MUL15(a[18], b[ 5])
637 + MUL15(a[19], b[ 4]);
638 t[24] = MUL15(a[ 5], b[19])
639 + MUL15(a[ 6], b[18])
640 + MUL15(a[ 7], b[17])
641 + MUL15(a[ 8], b[16])
642 + MUL15(a[ 9], b[15])
643 + MUL15(a[10], b[14])
644 + MUL15(a[11], b[13])
645 + MUL15(a[12], b[12])
646 + MUL15(a[13], b[11])
647 + MUL15(a[14], b[10])
648 + MUL15(a[15], b[ 9])
649 + MUL15(a[16], b[ 8])
650 + MUL15(a[17], b[ 7])
651 + MUL15(a[18], b[ 6])
652 + MUL15(a[19], b[ 5]);
653 t[25] = MUL15(a[ 6], b[19])
654 + MUL15(a[ 7], b[18])
655 + MUL15(a[ 8], b[17])
656 + MUL15(a[ 9], b[16])
657 + MUL15(a[10], b[15])
658 + MUL15(a[11], b[14])
659 + MUL15(a[12], b[13])
660 + MUL15(a[13], b[12])
661 + MUL15(a[14], b[11])
662 + MUL15(a[15], b[10])
663 + MUL15(a[16], b[ 9])
664 + MUL15(a[17], b[ 8])
665 + MUL15(a[18], b[ 7])
666 + MUL15(a[19], b[ 6]);
667 t[26] = MUL15(a[ 7], b[19])
668 + MUL15(a[ 8], b[18])
669 + MUL15(a[ 9], b[17])
670 + MUL15(a[10], b[16])
671 + MUL15(a[11], b[15])
672 + MUL15(a[12], b[14])
673 + MUL15(a[13], b[13])
674 + MUL15(a[14], b[12])
675 + MUL15(a[15], b[11])
676 + MUL15(a[16], b[10])
677 + MUL15(a[17], b[ 9])
678 + MUL15(a[18], b[ 8])
679 + MUL15(a[19], b[ 7]);
680 t[27] = MUL15(a[ 8], b[19])
681 + MUL15(a[ 9], b[18])
682 + MUL15(a[10], b[17])
683 + MUL15(a[11], b[16])
684 + MUL15(a[12], b[15])
685 + MUL15(a[13], b[14])
686 + MUL15(a[14], b[13])
687 + MUL15(a[15], b[12])
688 + MUL15(a[16], b[11])
689 + MUL15(a[17], b[10])
690 + MUL15(a[18], b[ 9])
691 + MUL15(a[19], b[ 8]);
692 t[28] = MUL15(a[ 9], b[19])
693 + MUL15(a[10], b[18])
694 + MUL15(a[11], b[17])
695 + MUL15(a[12], b[16])
696 + MUL15(a[13], b[15])
697 + MUL15(a[14], b[14])
698 + MUL15(a[15], b[13])
699 + MUL15(a[16], b[12])
700 + MUL15(a[17], b[11])
701 + MUL15(a[18], b[10])
702 + MUL15(a[19], b[ 9]);
703 t[29] = MUL15(a[10], b[19])
704 + MUL15(a[11], b[18])
705 + MUL15(a[12], b[17])
706 + MUL15(a[13], b[16])
707 + MUL15(a[14], b[15])
708 + MUL15(a[15], b[14])
709 + MUL15(a[16], b[13])
710 + MUL15(a[17], b[12])
711 + MUL15(a[18], b[11])
712 + MUL15(a[19], b[10]);
713 t[30] = MUL15(a[11], b[19])
714 + MUL15(a[12], b[18])
715 + MUL15(a[13], b[17])
716 + MUL15(a[14], b[16])
717 + MUL15(a[15], b[15])
718 + MUL15(a[16], b[14])
719 + MUL15(a[17], b[13])
720 + MUL15(a[18], b[12])
721 + MUL15(a[19], b[11]);
722 t[31] = MUL15(a[12], b[19])
723 + MUL15(a[13], b[18])
724 + MUL15(a[14], b[17])
725 + MUL15(a[15], b[16])
726 + MUL15(a[16], b[15])
727 + MUL15(a[17], b[14])
728 + MUL15(a[18], b[13])
729 + MUL15(a[19], b[12]);
730 t[32] = MUL15(a[13], b[19])
731 + MUL15(a[14], b[18])
732 + MUL15(a[15], b[17])
733 + MUL15(a[16], b[16])
734 + MUL15(a[17], b[15])
735 + MUL15(a[18], b[14])
736 + MUL15(a[19], b[13]);
737 t[33] = MUL15(a[14], b[19])
738 + MUL15(a[15], b[18])
739 + MUL15(a[16], b[17])
740 + MUL15(a[17], b[16])
741 + MUL15(a[18], b[15])
742 + MUL15(a[19], b[14]);
743 t[34] = MUL15(a[15], b[19])
744 + MUL15(a[16], b[18])
745 + MUL15(a[17], b[17])
746 + MUL15(a[18], b[16])
747 + MUL15(a[19], b[15]);
748 t[35] = MUL15(a[16], b[19])
749 + MUL15(a[17], b[18])
750 + MUL15(a[18], b[17])
751 + MUL15(a[19], b[16]);
752 t[36] = MUL15(a[17], b[19])
753 + MUL15(a[18], b[18])
754 + MUL15(a[19], b[17]);
755 t[37] = MUL15(a[18], b[19])
756 + MUL15(a[19], b[18]);
757 t[38] = MUL15(a[19], b[19]);
758 d[39] = norm13(d, t, 39);
759 }
760
761 #endif
762
763 /*
764 * Modulus for field F256 (field for point coordinates in curve P-256).
765 */
766 static const uint32_t F256[] = {
767 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x001F,
768 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0400, 0x0000,
769 0x0000, 0x1FF8, 0x1FFF, 0x01FF
770 };
771
772 /*
773 * The 'b' curve equation coefficient for P-256.
774 */
775 static const uint32_t P256_B[] = {
776 0x004B, 0x1E93, 0x0F89, 0x1C78, 0x03BC, 0x187B, 0x114E, 0x1619,
777 0x1D06, 0x0328, 0x01AF, 0x0D31, 0x1557, 0x15DE, 0x1ECF, 0x127C,
778 0x0A3A, 0x0EC5, 0x118D, 0x00B5
779 };
780
781 /*
782 * Perform a "short reduction" in field F256 (field for curve P-256).
783 * The source value should be less than 262 bits; on output, it will
784 * be at most 257 bits, and less than twice the modulus.
785 */
786 static void
787 reduce_f256(uint32_t *d)
788 {
789 uint32_t x;
790
791 x = d[19] >> 9;
792 d[19] &= 0x01FF;
793 d[17] += x << 3;
794 d[14] -= x << 10;
795 d[7] -= x << 5;
796 d[0] += x;
797 norm13(d, d, 20);
798 }
799
800 /*
801 * Perform a "final reduction" in field F256 (field for curve P-256).
802 * The source value must be less than twice the modulus. If the value
803 * is not lower than the modulus, then the modulus is subtracted and
804 * this function returns 1; otherwise, it leaves it untouched and it
805 * returns 0.
806 */
807 static uint32_t
808 reduce_final_f256(uint32_t *d)
809 {
810 uint32_t t[20];
811 uint32_t cc;
812 int i;
813
814 memcpy(t, d, sizeof t);
815 cc = 0;
816 for (i = 0; i < 20; i ++) {
817 uint32_t w;
818
819 w = t[i] - F256[i] - cc;
820 cc = w >> 31;
821 t[i] = w & 0x1FFF;
822 }
823 cc ^= 1;
824 CCOPY(cc, d, t, sizeof t);
825 return cc;
826 }
827
828 /*
829 * Perform a multiplication of two integers modulo
830 * 2^256-2^224+2^192+2^96-1 (for NIST curve P-256). Operands are arrays
831 * of 20 words, each containing 13 bits of data, in little-endian order.
832 * On input, upper word may be up to 15 bits (hence value up to 2^262-1);
833 * on output, value fits on 257 bits and is lower than twice the modulus.
834 */
835 static void
836 mul_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
837 {
838 uint32_t t[40], cc;
839 int i;
840
841 /*
842 * Compute raw multiplication. All result words fit in 13 bits
843 * each.
844 */
845 mul20(t, a, b);
846
847 /*
848 * Modular reduction: each high word in added/subtracted where
849 * necessary.
850 *
851 * The modulus is:
852 * p = 2^256 - 2^224 + 2^192 + 2^96 - 1
853 * Therefore:
854 * 2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
855 *
856 * For a word x at bit offset n (n >= 256), we have:
857 * x*2^n = x*2^(n-32) - x*2^(n-64)
858 * - x*2^(n - 160) + x*2^(n-256) mod p
859 *
860 * Thus, we can nullify the high word if we reinject it at some
861 * proper emplacements.
862 */
863 for (i = 39; i >= 20; i --) {
864 uint32_t x;
865
866 x = t[i];
867 t[i - 2] += ARSH(x, 6);
868 t[i - 3] += (x << 7) & 0x1FFF;
869 t[i - 4] -= ARSH(x, 12);
870 t[i - 5] -= (x << 1) & 0x1FFF;
871 t[i - 12] -= ARSH(x, 4);
872 t[i - 13] -= (x << 9) & 0x1FFF;
873 t[i - 19] += ARSH(x, 9);
874 t[i - 20] += (x << 4) & 0x1FFF;
875 }
876
877 /*
878 * Propagate carries. Since the operation above really is a
879 * truncature, followed by the addition of nonnegative values,
880 * the result will be positive. Moreover, the carry cannot
881 * exceed 5 bits (we performed 20 additions with values smaller
882 * than 256 bits).
883 */
884 cc = norm13(t, t, 20);
885
886 /*
887 * Perform modular reduction again for the bits beyond 256 (the carry
888 * and the bits 256..259). This time, we can simply inject full
889 * word values.
890 */
891 cc = (cc << 4) | (t[19] >> 9);
892 t[19] &= 0x01FF;
893 t[17] += cc << 3;
894 t[14] -= cc << 10;
895 t[7] -= cc << 5;
896 t[0] += cc;
897 norm13(d, t, 20);
898 }
899
900 /*
901 * Jacobian coordinates for a point in P-256: affine coordinates (X,Y)
902 * are such that:
903 * X = x / z^2
904 * Y = y / z^3
905 * For the point at infinity, z = 0.
906 * Each point thus admits many possible representations.
907 *
908 * Coordinates are represented in arrays of 32-bit integers, each holding
909 * 13 bits of data. Values may also be slightly greater than the modulus,
910 * but they will always be lower than twice the modulus.
911 */
912 typedef struct {
913 uint32_t x[20];
914 uint32_t y[20];
915 uint32_t z[20];
916 } p256_jacobian;
917
918 /*
919 * Convert a point to affine coordinates:
920 * - If the point is the point at infinity, then all three coordinates
921 * are set to 0.
922 * - Otherwise, the 'z' coordinate is set to 1, and the 'x' and 'y'
923 * coordinates are the 'X' and 'Y' affine coordinates.
924 * The coordinates are guaranteed to be lower than the modulus.
925 */
926 static void
927 p256_to_affine(p256_jacobian *P)
928 {
929 uint32_t t1[20], t2[20];
930 int i;
931
932 /*
933 * Invert z with a modular exponentiation: the modulus is
934 * p = 2^256 - 2^224 + 2^192 + 2^96 - 1, and the exponent is
935 * p-2. Exponent bit pattern (from high to low) is:
936 * - 32 bits of value 1
937 * - 31 bits of value 0
938 * - 1 bit of value 1
939 * - 96 bits of value 0
940 * - 94 bits of value 1
941 * - 1 bit of value 0
942 * - 1 bit of value 1
943 * Thus, we precompute z^(2^31-1) to speed things up.
944 *
945 * If z = 0 (point at infinity) then the modular exponentiation
946 * will yield 0, which leads to the expected result (all three
947 * coordinates set to 0).
948 */
949
950 /*
951 * A simple square-and-multiply for z^(2^31-1). We could save about
952 * two dozen multiplications here with an addition chain, but
953 * this would require a bit more code, and extra stack buffers.
954 */
955 memcpy(t1, P->z, sizeof P->z);
956 for (i = 0; i < 30; i ++) {
957 mul_f256(t1, t1, t1);
958 mul_f256(t1, t1, P->z);
959 }
960
961 /*
962 * Square-and-multiply. Apart from the squarings, we have a few
963 * multiplications to set bits to 1; we multiply by the original z
964 * for setting 1 bit, and by t1 for setting 31 bits.
965 */
966 memcpy(t2, P->z, sizeof P->z);
967 for (i = 1; i < 256; i ++) {
968 mul_f256(t2, t2, t2);
969 switch (i) {
970 case 31:
971 case 190:
972 case 221:
973 case 252:
974 mul_f256(t2, t2, t1);
975 break;
976 case 63:
977 case 253:
978 case 255:
979 mul_f256(t2, t2, P->z);
980 break;
981 }
982 }
983
984 /*
985 * Now that we have 1/z, multiply x by 1/z^2 and y by 1/z^3.
986 */
987 mul_f256(t1, t2, t2);
988 mul_f256(P->x, t1, P->x);
989 mul_f256(t1, t1, t2);
990 mul_f256(P->y, t1, P->y);
991 reduce_final_f256(P->x);
992 reduce_final_f256(P->y);
993
994 /*
995 * Multiply z by 1/z. If z = 0, then this will yield 0, otherwise
996 * this will set z to 1.
997 */
998 mul_f256(P->z, P->z, t2);
999 reduce_final_f256(P->z);
1000 }
1001
1002 /*
1003 * Double a point in P-256. This function works for all valid points,
1004 * including the point at infinity.
1005 */
1006 static void
1007 p256_double(p256_jacobian *Q)
1008 {
1009 /*
1010 * Doubling formulas are:
1011 *
1012 * s = 4*x*y^2
1013 * m = 3*(x + z^2)*(x - z^2)
1014 * x' = m^2 - 2*s
1015 * y' = m*(s - x') - 8*y^4
1016 * z' = 2*y*z
1017 *
1018 * These formulas work for all points, including points of order 2
1019 * and points at infinity:
1020 * - If y = 0 then z' = 0. But there is no such point in P-256
1021 * anyway.
1022 * - If z = 0 then z' = 0.
1023 */
1024 uint32_t t1[20], t2[20], t3[20], t4[20];
1025 int i;
1026
1027 /*
1028 * Compute z^2 in t1.
1029 */
1030 mul_f256(t1, Q->z, Q->z);
1031
1032 /*
1033 * Compute x-z^2 in t2 and x+z^2 in t1.
1034 */
1035 for (i = 0; i < 20; i ++) {
1036 t2[i] = (F256[i] << 1) + Q->x[i] - t1[i];
1037 t1[i] += Q->x[i];
1038 }
1039 norm13(t1, t1, 20);
1040 norm13(t2, t2, 20);
1041
1042 /*
1043 * Compute 3*(x+z^2)*(x-z^2) in t1.
1044 */
1045 mul_f256(t3, t1, t2);
1046 for (i = 0; i < 20; i ++) {
1047 t1[i] = MUL15(3, t3[i]);
1048 }
1049 norm13(t1, t1, 20);
1050
1051 /*
1052 * Compute 4*x*y^2 (in t2) and 2*y^2 (in t3).
1053 */
1054 mul_f256(t3, Q->y, Q->y);
1055 for (i = 0; i < 20; i ++) {
1056 t3[i] <<= 1;
1057 }
1058 norm13(t3, t3, 20);
1059 mul_f256(t2, Q->x, t3);
1060 for (i = 0; i < 20; i ++) {
1061 t2[i] <<= 1;
1062 }
1063 norm13(t2, t2, 20);
1064 reduce_f256(t2);
1065
1066 /*
1067 * Compute x' = m^2 - 2*s.
1068 */
1069 mul_f256(Q->x, t1, t1);
1070 for (i = 0; i < 20; i ++) {
1071 Q->x[i] += (F256[i] << 2) - (t2[i] << 1);
1072 }
1073 norm13(Q->x, Q->x, 20);
1074 reduce_f256(Q->x);
1075
1076 /*
1077 * Compute z' = 2*y*z.
1078 */
1079 mul_f256(t4, Q->y, Q->z);
1080 for (i = 0; i < 20; i ++) {
1081 Q->z[i] = t4[i] << 1;
1082 }
1083 norm13(Q->z, Q->z, 20);
1084 reduce_f256(Q->z);
1085
1086 /*
1087 * Compute y' = m*(s - x') - 8*y^4. Note that we already have
1088 * 2*y^2 in t3.
1089 */
1090 for (i = 0; i < 20; i ++) {
1091 t2[i] += (F256[i] << 1) - Q->x[i];
1092 }
1093 norm13(t2, t2, 20);
1094 mul_f256(Q->y, t1, t2);
1095 mul_f256(t4, t3, t3);
1096 for (i = 0; i < 20; i ++) {
1097 Q->y[i] += (F256[i] << 2) - (t4[i] << 1);
1098 }
1099 norm13(Q->y, Q->y, 20);
1100 reduce_f256(Q->y);
1101 }
1102
1103 /*
1104 * Add point P2 to point P1.
1105 *
1106 * This function computes the wrong result in the following cases:
1107 *
1108 * - If P1 == 0 but P2 != 0
1109 * - If P1 != 0 but P2 == 0
1110 * - If P1 == P2
1111 *
1112 * In all three cases, P1 is set to the point at infinity.
1113 *
1114 * Returned value is 0 if one of the following occurs:
1115 *
1116 * - P1 and P2 have the same Y coordinate
1117 * - P1 == 0 and P2 == 0
1118 * - The Y coordinate of one of the points is 0 and the other point is
1119 * the point at infinity.
1120 *
1121 * The third case cannot actually happen with valid points, since a point
1122 * with Y == 0 is a point of order 2, and there is no point of order 2 on
1123 * curve P-256.
1124 *
1125 * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller
1126 * can apply the following:
1127 *
1128 * - If the result is not the point at infinity, then it is correct.
1129 * - Otherwise, if the returned value is 1, then this is a case of
1130 * P1+P2 == 0, so the result is indeed the point at infinity.
1131 * - Otherwise, P1 == P2, so a "double" operation should have been
1132 * performed.
1133 */
1134 static uint32_t
1135 p256_add(p256_jacobian *P1, const p256_jacobian *P2)
1136 {
1137 /*
1138 * Addtions formulas are:
1139 *
1140 * u1 = x1 * z2^2
1141 * u2 = x2 * z1^2
1142 * s1 = y1 * z2^3
1143 * s2 = y2 * z1^3
1144 * h = u2 - u1
1145 * r = s2 - s1
1146 * x3 = r^2 - h^3 - 2 * u1 * h^2
1147 * y3 = r * (u1 * h^2 - x3) - s1 * h^3
1148 * z3 = h * z1 * z2
1149 */
1150 uint32_t t1[20], t2[20], t3[20], t4[20], t5[20], t6[20], t7[20];
1151 uint32_t ret;
1152 int i;
1153
1154 /*
1155 * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
1156 */
1157 mul_f256(t3, P2->z, P2->z);
1158 mul_f256(t1, P1->x, t3);
1159 mul_f256(t4, P2->z, t3);
1160 mul_f256(t3, P1->y, t4);
1161
1162 /*
1163 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
1164 */
1165 mul_f256(t4, P1->z, P1->z);
1166 mul_f256(t2, P2->x, t4);
1167 mul_f256(t5, P1->z, t4);
1168 mul_f256(t4, P2->y, t5);
1169
1170 /*
1171 * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
1172 * We need to test whether r is zero, so we will do some extra
1173 * reduce.
1174 */
1175 for (i = 0; i < 20; i ++) {
1176 t2[i] += (F256[i] << 1) - t1[i];
1177 t4[i] += (F256[i] << 1) - t3[i];
1178 }
1179 norm13(t2, t2, 20);
1180 norm13(t4, t4, 20);
1181 reduce_f256(t4);
1182 reduce_final_f256(t4);
1183 ret = 0;
1184 for (i = 0; i < 20; i ++) {
1185 ret |= t4[i];
1186 }
1187 ret = (ret | -ret) >> 31;
1188
1189 /*
1190 * Compute u1*h^2 (in t6) and h^3 (in t5);
1191 */
1192 mul_f256(t7, t2, t2);
1193 mul_f256(t6, t1, t7);
1194 mul_f256(t5, t7, t2);
1195
1196 /*
1197 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
1198 */
1199 mul_f256(P1->x, t4, t4);
1200 for (i = 0; i < 20; i ++) {
1201 P1->x[i] += (F256[i] << 3) - t5[i] - (t6[i] << 1);
1202 }
1203 norm13(P1->x, P1->x, 20);
1204 reduce_f256(P1->x);
1205
1206 /*
1207 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
1208 */
1209 for (i = 0; i < 20; i ++) {
1210 t6[i] += (F256[i] << 1) - P1->x[i];
1211 }
1212 norm13(t6, t6, 20);
1213 mul_f256(P1->y, t4, t6);
1214 mul_f256(t1, t5, t3);
1215 for (i = 0; i < 20; i ++) {
1216 P1->y[i] += (F256[i] << 1) - t1[i];
1217 }
1218 norm13(P1->y, P1->y, 20);
1219 reduce_f256(P1->y);
1220
1221 /*
1222 * Compute z3 = h*z1*z2.
1223 */
1224 mul_f256(t1, P1->z, P2->z);
1225 mul_f256(P1->z, t1, t2);
1226
1227 return ret;
1228 }
1229
1230 /*
1231 * Decode a P-256 point. This function does not support the point at
1232 * infinity. Returned value is 0 if the point is invalid, 1 otherwise.
1233 */
1234 static uint32_t
1235 p256_decode(p256_jacobian *P, const void *src, size_t len)
1236 {
1237 const unsigned char *buf;
1238 uint32_t tx[20], ty[20], t1[20], t2[20];
1239 uint32_t bad;
1240 int i;
1241
1242 if (len != 65) {
1243 return 0;
1244 }
1245 buf = src;
1246
1247 /*
1248 * First byte must be 0x04 (uncompressed format). We could support
1249 * "hybrid format" (first byte is 0x06 or 0x07, and encodes the
1250 * least significant bit of the Y coordinate), but it is explicitly
1251 * forbidden by RFC 5480 (section 2.2).
1252 */
1253 bad = NEQ(buf[0], 0x04);
1254
1255 /*
1256 * Decode the coordinates, and check that they are both lower
1257 * than the modulus.
1258 */
1259 tx[19] = be8_to_le13(tx, buf + 1, 32);
1260 ty[19] = be8_to_le13(ty, buf + 33, 32);
1261 bad |= reduce_final_f256(tx);
1262 bad |= reduce_final_f256(ty);
1263
1264 /*
1265 * Check curve equation.
1266 */
1267 mul_f256(t1, tx, tx);
1268 mul_f256(t1, tx, t1);
1269 mul_f256(t2, ty, ty);
1270 for (i = 0; i < 20; i ++) {
1271 t1[i] += (F256[i] << 3) - MUL15(3, tx[i]) + P256_B[i] - t2[i];
1272 }
1273 norm13(t1, t1, 20);
1274 reduce_f256(t1);
1275 reduce_final_f256(t1);
1276 for (i = 0; i < 20; i ++) {
1277 bad |= t1[i];
1278 }
1279
1280 /*
1281 * Copy coordinates to the point structure.
1282 */
1283 memcpy(P->x, tx, sizeof tx);
1284 memcpy(P->y, ty, sizeof ty);
1285 memset(P->z, 0, sizeof P->z);
1286 P->z[0] = 1;
1287 return NEQ(bad, 0) ^ 1;
1288 }
1289
1290 /*
1291 * Encode a point into a buffer. This function assumes that the point is
1292 * valid, in affine coordinates, and not the point at infinity.
1293 */
1294 static void
1295 p256_encode(void *dst, const p256_jacobian *P)
1296 {
1297 unsigned char *buf;
1298
1299 buf = dst;
1300 buf[0] = 0x04;
1301 le13_to_be8(buf + 1, 32, P->x);
1302 le13_to_be8(buf + 33, 32, P->y);
1303 }
1304
1305 /*
1306 * Multiply a curve point by an integer. The integer is assumed to be
1307 * lower than the curve order, and the base point must not be the point
1308 * at infinity.
1309 */
1310 static void
1311 p256_mul(p256_jacobian *P, const unsigned char *x, size_t xlen)
1312 {
1313 /*
1314 * qz is a flag that is initially 1, and remains equal to 1
1315 * as long as the point is the point at infinity.
1316 *
1317 * We use a 2-bit window to handle multiplier bits by pairs.
1318 * The precomputed window really is the points P2 and P3.
1319 */
1320 uint32_t qz;
1321 p256_jacobian P2, P3, Q, T, U;
1322
1323 /*
1324 * Compute window values.
1325 */
1326 P2 = *P;
1327 p256_double(&P2);
1328 P3 = *P;
1329 p256_add(&P3, &P2);
1330
1331 /*
1332 * We start with Q = 0. We process multiplier bits 2 by 2.
1333 */
1334 memset(&Q, 0, sizeof Q);
1335 qz = 1;
1336 while (xlen -- > 0) {
1337 int k;
1338
1339 for (k = 6; k >= 0; k -= 2) {
1340 uint32_t bits;
1341 uint32_t bnz;
1342
1343 p256_double(&Q);
1344 p256_double(&Q);
1345 T = *P;
1346 U = Q;
1347 bits = (*x >> k) & (uint32_t)3;
1348 bnz = NEQ(bits, 0);
1349 CCOPY(EQ(bits, 2), &T, &P2, sizeof T);
1350 CCOPY(EQ(bits, 3), &T, &P3, sizeof T);
1351 p256_add(&U, &T);
1352 CCOPY(bnz & qz, &Q, &T, sizeof Q);
1353 CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
1354 qz &= ~bnz;
1355 }
1356 x ++;
1357 }
1358 *P = Q;
1359 }
1360
1361 static const unsigned char P256_G[] = {
1362 0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8,
1363 0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D,
1364 0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8,
1365 0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
1366 0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B,
1367 0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40,
1368 0x68, 0x37, 0xBF, 0x51, 0xF5
1369 };
1370
1371 static const unsigned char P256_N[] = {
1372 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
1373 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD,
1374 0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63,
1375 0x25, 0x51
1376 };
1377
1378 static const unsigned char *
1379 api_generator(int curve, size_t *len)
1380 {
1381 (void)curve;
1382 *len = sizeof P256_G;
1383 return P256_G;
1384 }
1385
1386 static const unsigned char *
1387 api_order(int curve, size_t *len)
1388 {
1389 (void)curve;
1390 *len = sizeof P256_N;
1391 return P256_N;
1392 }
1393
1394 static uint32_t
1395 api_mul(unsigned char *G, size_t Glen,
1396 const unsigned char *x, size_t xlen, int curve)
1397 {
1398 uint32_t r;
1399 p256_jacobian P;
1400
1401 (void)curve;
1402 r = p256_decode(&P, G, Glen);
1403 p256_mul(&P, x, xlen);
1404 if (Glen >= 65) {
1405 p256_to_affine(&P);
1406 p256_encode(G, &P);
1407 }
1408 return r;
1409 }
1410
1411 static uint32_t
1412 api_muladd(unsigned char *A, const unsigned char *B, size_t len,
1413 const unsigned char *x, size_t xlen,
1414 const unsigned char *y, size_t ylen, int curve)
1415 {
1416 p256_jacobian P, Q;
1417 uint32_t r, t, z;
1418 int i;
1419
1420 (void)curve;
1421 r = p256_decode(&P, A, len);
1422 r &= p256_decode(&Q, B, len);
1423 p256_mul(&P, x, xlen);
1424 p256_mul(&Q, y, ylen);
1425
1426 /*
1427 * The final addition may fail in case both points are equal.
1428 */
1429 t = p256_add(&P, &Q);
1430 reduce_final_f256(P.z);
1431 z = 0;
1432 for (i = 0; i < 20; i ++) {
1433 z |= P.z[i];
1434 }
1435 z = EQ(z, 0);
1436 p256_double(&Q);
1437
1438 /*
1439 * If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
1440 * have the following:
1441 *
1442 * z = 0, t = 0 return P (normal addition)
1443 * z = 0, t = 1 return P (normal addition)
1444 * z = 1, t = 0 return Q (a 'double' case)
1445 * z = 1, t = 1 report an error (P+Q = 0)
1446 */
1447 CCOPY(z & ~t, &P, &Q, sizeof Q);
1448 p256_to_affine(&P);
1449 p256_encode(A, &P);
1450 r &= ~(z & t);
1451 return r;
1452 }
1453
1454 /* see bearssl_ec.h */
1455 const br_ec_impl br_ec_p256_i15 = {
1456 (uint32_t)0x00800000,
1457 &api_generator,
1458 &api_order,
1459 &api_mul,
1460 &api_muladd
1461 };