37dd4a3a90ca9b3566fecd4e4688fc7d7b64ac27
[BearSSL] / src / ec / ec_p256_i15.c
1 /*
2 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining
5 * a copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "inner.h"
26
27 /*
28 * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
29 * that right-shifting a signed negative integer copies the sign bit
30 * (arithmetic right-shift). This is "implementation-defined behaviour",
31 * i.e. it is not undefined, but it may differ between compilers. Each
32 * compiler is supposed to document its behaviour in that respect. GCC
33 * explicitly defines that an arithmetic right shift is used. We expect
34 * all other compilers to do the same, because underlying CPU offer an
35 * arithmetic right shift opcode that could not be used otherwise.
36 */
37 #if BR_NO_ARITH_SHIFT
38 #define ARSH(x, n) (((uint32_t)(x) >> (n)) \
39 | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
40 #else
41 #define ARSH(x, n) ((*(int32_t *)&(x)) >> (n))
42 #endif
43
44 /*
45 * Convert an integer from unsigned big-endian encoding to a sequence of
46 * 13-bit words in little-endian order. The final "partial" word is
47 * returned.
48 */
49 static uint32_t
50 be8_to_le13(uint32_t *dst, const unsigned char *src, size_t len)
51 {
52 uint32_t acc;
53 int acc_len;
54
55 acc = 0;
56 acc_len = 0;
57 while (len -- > 0) {
58 acc |= (uint32_t)src[len] << acc_len;
59 acc_len += 8;
60 if (acc_len >= 13) {
61 *dst ++ = acc & 0x1FFF;
62 acc >>= 13;
63 acc_len -= 13;
64 }
65 }
66 return acc;
67 }
68
69 /*
70 * Convert an integer (13-bit words, little-endian) to unsigned
71 * big-endian encoding. The total encoding length is provided; all
72 * the destination bytes will be filled.
73 */
74 static void
75 le13_to_be8(unsigned char *dst, size_t len, const uint32_t *src)
76 {
77 uint32_t acc;
78 int acc_len;
79
80 acc = 0;
81 acc_len = 0;
82 while (len -- > 0) {
83 if (acc_len < 8) {
84 acc |= (*src ++) << acc_len;
85 acc_len += 13;
86 }
87 dst[len] = (unsigned char)acc;
88 acc >>= 8;
89 acc_len -= 8;
90 }
91 }
92
93 /*
94 * Multiply two 4-word integers. Basis is 2^13 = 8192. Individual words
95 * may have value up to 32766. Result integer consists in eight words;
96 * each word is in the 0..8191 range, except the last one (d[7]) which
97 * gathers the excess bits.
98 *
99 * Max value for d[7], depending on max word value:
100 *
101 * 32764: 131071
102 * 32765: 131080
103 * 32766: 131088
104 */
105 static inline void
106 mul4(uint32_t *d, const uint32_t *a, const uint32_t *b)
107 {
108 uint32_t t0, t1, t2, t3, t4, t5, t6;
109
110 t0 = MUL15(a[0], b[0]);
111 t1 = MUL15(a[0], b[1]) + MUL15(a[1], b[0]);
112 t2 = MUL15(a[0], b[2]) + MUL15(a[1], b[1]) + MUL15(a[2], b[0]);
113 t3 = MUL15(a[0], b[3]) + MUL15(a[1], b[2])
114 + MUL15(a[2], b[1]) + MUL15(a[3], b[0]);
115 t4 = MUL15(a[1], b[3]) + MUL15(a[2], b[2]) + MUL15(a[3], b[1]);
116 t5 = MUL15(a[2], b[3]) + MUL15(a[3], b[2]);
117 t6 = MUL15(a[3], b[3]);
118
119 /*
120 * Maximum value is obtained when adding the carry to t3, when all
121 * input words have maximum value. When the maximum is 32766,
122 * the addition of t3 with the carry (t2 >> 13) yields 4294836223,
123 * which is still below 2^32 = 4294967296.
124 */
125
126 d[0] = t0 & 0x1FFF;
127 t1 += t0 >> 13;
128 d[1] = t1 & 0x1FFF;
129 t2 += t1 >> 13;
130 d[2] = t2 & 0x1FFF;
131 t3 += t2 >> 13;
132 d[3] = t3 & 0x1FFF;
133 t4 += t3 >> 13;
134 d[4] = t4 & 0x1FFF;
135 t5 += t4 >> 13;
136 d[5] = t5 & 0x1FFF;
137 t6 += t5 >> 13;
138 d[6] = t6 & 0x1FFF;
139 d[7] = t6 >> 13;
140 }
141
142 /*
143 * Normalise an array of words to a strict 13 bits per word. Returned
144 * value is the resulting carry. The source (w) and destination (d)
145 * arrays may be identical, but shall not overlap partially.
146 */
147 static uint32_t
148 norm13(uint32_t *d, const uint32_t *w, size_t len)
149 {
150 size_t u;
151 uint32_t cc;
152
153 cc = 0;
154 for (u = 0; u < len; u ++) {
155 int32_t z;
156
157 z = w[u] + cc;
158 d[u] = z & 0x1FFF;
159 cc = ARSH(z, 13);
160 }
161 return cc;
162 }
163
164 /*
165 * Multiply two 20-word values together, result over 40 words. Each word
166 * contains 13 bits of data; the top words (a[19] and b[19]) may contain
167 * up to 15 bits each. Therefore this routine handles integer up to 2^262-1.
168 * All words in the output have length 13 bits, except possibly the top
169 * one, in case the sources had excess bits.
170 */
171 static void
172 mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
173 {
174 /*
175 * We first split the 20-word values into a 16-word value and
176 * a 4-word value. The 16x16 product uses two levels of Karatsuba
177 * decomposition. The preparatory additions are done word-wise
178 * without carry propagation; since the input words have size
179 * at most 13 bits, the sums may be up to 4*8191 = 32764, which
180 * is in the range supported by mul4().
181 */
182 uint32_t u[72], v[72], w[144];
183 uint32_t cc;
184 int i;
185
186 /*
187 * Karatsuba decomposition, two levels.
188 *
189 * off src
190 * 0..7 * 0..7
191 * 0 0..3 * 0..3
192 * 4 4..7 * 4..7
193 * 16 0..3+4..7 * 0..3+4..7
194 *
195 * 8..15 * 8..15
196 * 8 8..11 * 8..11
197 * 12 12..15 * 12..15
198 * 20 8..11+12..15 * 8..11+12..15
199 *
200 * 0..7+8..15 * 0..7+8..15
201 * 24 0..3+8..11 * 0..3+8..11
202 * 28 4..7+12..15 * 4..7+12..15
203 * 32 0..3+4..7+8..11+12..15 * 0..3+4..7+8..11+12..15
204 */
205
206 #define WADD(z, x, y) do { \
207 u[(z) + 0] = u[(x) + 0] + u[y + 0]; \
208 u[(z) + 1] = u[(x) + 1] + u[y + 1]; \
209 u[(z) + 2] = u[(x) + 2] + u[y + 2]; \
210 u[(z) + 3] = u[(x) + 3] + u[y + 3]; \
211 v[(z) + 0] = v[(x) + 0] + v[y + 0]; \
212 v[(z) + 1] = v[(x) + 1] + v[y + 1]; \
213 v[(z) + 2] = v[(x) + 2] + v[y + 2]; \
214 v[(z) + 3] = v[(x) + 3] + v[y + 3]; \
215 } while (0)
216
217 memcpy(u, a, 16 * sizeof *a);
218 memcpy(v, b, 16 * sizeof *b);
219 WADD(16, 0, 4);
220 WADD(20, 8, 12);
221 WADD(24, 0, 8);
222 WADD(28, 4, 12);
223 WADD(32, 16, 20);
224 memcpy(u + 36, a, 16 * sizeof *a);
225 memcpy(v + 52, b, 16 * sizeof *b);
226 for (i = 0; i < 4; i ++) {
227 memcpy(u + 52 + (i << 2), a + 16, 4 * sizeof *a);
228 memcpy(v + 36 + (i << 2), b + 16, 4 * sizeof *b);
229 }
230 memcpy(u + 68, a + 16, 4 * sizeof *a);
231 memcpy(v + 68, b + 16, 4 * sizeof *b);
232
233 #undef WADD
234
235 /*
236 * Perform the elementary 4x4 multiplications:
237 * 16x16: 9 multiplications (Karatsuba)
238 * 16x4 (1): 4 multiplications
239 * 16x4 (2): 4 multiplications
240 * 4x4: 1 multiplication
241 */
242 for (i = 0; i < 18; i ++) {
243 mul4(w + (i << 3), u + (i << 2), v + (i << 2));
244 }
245
246 /*
247 * mul4 cross-product adjustments:
248 * subtract 0 and 8 from 32 (8 words)
249 * subtract 16 and 24 from 40 (8 words)
250 * subtract 48 and 56 from 64 (8 words)
251 */
252 for (i = 0; i < 8; i ++) {
253 w[i + 32] -= w[i + 0] + w[i + 8];
254 w[i + 40] -= w[i + 16] + w[i + 24];
255 w[i + 64] -= w[i + 48] + w[i + 56];
256 }
257
258 /*
259 * complete the three 8x8 products:
260 * add 32 to 4 (8 words)
261 * add 40 to 20 (8 words)
262 * add 64 to 52 (8 words)
263 */
264 for (i = 0; i < 8; i ++) {
265 w[i + 4] += w[i + 32];
266 w[i + 20] += w[i + 40];
267 w[i + 52] += w[i + 64];
268 }
269
270 /*
271 * Adjust the 16x16 product:
272 * subtract 0 from 48 (16 words)
273 * subtract 16 from 48 (16 words)
274 * add 48 to 8 (16 words)
275 */
276 for (i = 0; i < 16; i ++) {
277 w[i + 48] -= w[i + 0];
278 w[i + 48] -= w[i + 16];
279 }
280 for (i = 0; i < 16; i ++) {
281 w[i + 8] += w[i + 48];
282 }
283
284 /*
285 * At that point, the product of the low chunks (0..15 * 0..15)
286 * is in words 0..31. We must add the three other partial products,
287 * which begin at word 72 in w[]. Words 32 to 39 are first set to
288 * the product of the high chunks (16..19 * 16..19), then the
289 * low-high cross products are added in.
290 */
291 memcpy(w + 32, w + 136, 8 * sizeof w[0]);
292 for (i = 0; i < 8; i ++) {
293 w[i + 16] += w[i + 72] + w[i + 104];
294 w[i + 20] += w[i + 80] + w[i + 112];
295 w[i + 24] += w[i + 88] + w[i + 120];
296 w[i + 28] += w[i + 96] + w[i + 128];
297 }
298
299 /*
300 * We did all the additions and subtractions in a word-wise way,
301 * which is fine since we have plenty of extra bits for carries.
302 * We must now do the carry propagation.
303 */
304 cc = norm13(d, w, 40);
305 d[39] += (cc << 13);
306 }
307
308 /*
309 * Modulus for field F256 (field for point coordinates in curve P-256).
310 */
311 static const uint32_t F256[] = {
312 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x001F,
313 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0400, 0x0000,
314 0x0000, 0x1FF8, 0x1FFF, 0x01FF
315 };
316
317 /*
318 * The 'b' curve equation coefficient for P-256.
319 */
320 static const uint32_t P256_B[] = {
321 0x004B, 0x1E93, 0x0F89, 0x1C78, 0x03BC, 0x187B, 0x114E, 0x1619,
322 0x1D06, 0x0328, 0x01AF, 0x0D31, 0x1557, 0x15DE, 0x1ECF, 0x127C,
323 0x0A3A, 0x0EC5, 0x118D, 0x00B5
324 };
325
326 /*
327 * Perform a "short reduction" in field F256 (field for curve P-256).
328 * The source value should be less than 262 bits; on output, it will
329 * be at most 257 bits, and less than twice the modulus.
330 */
331 static void
332 reduce_f256(uint32_t *d)
333 {
334 uint32_t x;
335
336 x = d[19] >> 9;
337 d[19] &= 0x01FF;
338 d[17] += x << 3;
339 d[14] -= x << 10;
340 d[7] -= x << 5;
341 d[0] += x;
342 norm13(d, d, 20);
343 }
344
345 /*
346 * Perform a "final reduction" in field F256 (field for curve P-256).
347 * The source value must be less than twice the modulus. If the value
348 * is not lower than the modulus, then the modulus is subtracted and
349 * this function returns 1; otherwise, it leaves it untouched and it
350 * returns 0.
351 */
352 static uint32_t
353 reduce_final_f256(uint32_t *d)
354 {
355 uint32_t t[20];
356 uint32_t cc;
357 int i;
358
359 memcpy(t, d, sizeof t);
360 cc = 0;
361 for (i = 0; i < 20; i ++) {
362 uint32_t w;
363
364 w = t[i] - F256[i] - cc;
365 cc = w >> 31;
366 t[i] = w & 0x1FFF;
367 }
368 cc ^= 1;
369 CCOPY(cc, d, t, sizeof t);
370 return cc;
371 }
372
373 /*
374 * Perform a multiplication of two integers modulo
375 * 2^256-2^224+2^192+2^96-1 (for NIST curve P-256). Operands are arrays
376 * of 20 words, each containing 13 bits of data, in little-endian order.
377 * On input, upper word may be up to 15 bits (hence value up to 2^262-1);
378 * on output, value fits on 257 bits and is lower than twice the modulus.
379 */
380 static void
381 mul_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
382 {
383 uint32_t t[40], cc;
384 int i;
385
386 /*
387 * Compute raw multiplication. All result words fit in 13 bits
388 * each.
389 */
390 mul20(t, a, b);
391
392 /*
393 * Modular reduction: each high word in added/subtracted where
394 * necessary.
395 *
396 * The modulus is:
397 * p = 2^256 - 2^224 + 2^192 + 2^96 - 1
398 * Therefore:
399 * 2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
400 *
401 * For a word x at bit offset n (n >= 256), we have:
402 * x*2^n = x*2^(n-32) - x*2^(n-64)
403 * - x*2^(n - 160) + x*2^(n-256) mod p
404 *
405 * Thus, we can nullify the high word if we reinject it at some
406 * proper emplacements.
407 */
408 for (i = 39; i >= 20; i --) {
409 uint32_t x;
410
411 x = t[i];
412 t[i - 2] += ARSH(x, 6);
413 t[i - 3] += (x << 7) & 0x1FFF;
414 t[i - 4] -= ARSH(x, 12);
415 t[i - 5] -= (x << 1) & 0x1FFF;
416 t[i - 12] -= ARSH(x, 4);
417 t[i - 13] -= (x << 9) & 0x1FFF;
418 t[i - 19] += ARSH(x, 9);
419 t[i - 20] += (x << 4) & 0x1FFF;
420 }
421
422 /*
423 * Propagate carries. Since the operation above really is a
424 * truncature, followed by the addition of nonnegative values,
425 * the result will be positive. Moreover, the carry cannot
426 * exceed 5 bits (we performed 20 additions with values smaller
427 * than 256 bits).
428 */
429 cc = norm13(t, t, 20);
430
431 /*
432 * Perform modular reduction again for the bits beyond 256 (the carry
433 * and the bits 256..259). This time, we can simply inject full
434 * word values.
435 */
436 cc = (cc << 4) | (t[19] >> 9);
437 t[19] &= 0x01FF;
438 t[17] += cc << 3;
439 t[14] -= cc << 10;
440 t[7] -= cc << 5;
441 t[0] += cc;
442 norm13(d, t, 20);
443 }
444
445 /*
446 * Jacobian coordinates for a point in P-256: affine coordinates (X,Y)
447 * are such that:
448 * X = x / z^2
449 * Y = y / z^3
450 * For the point at infinity, z = 0.
451 * Each point thus admits many possible representations.
452 *
453 * Coordinates are represented in arrays of 32-bit integers, each holding
454 * 13 bits of data. Values may also be slightly greater than the modulus,
455 * but they will always be lower than twice the modulus.
456 */
457 typedef struct {
458 uint32_t x[20];
459 uint32_t y[20];
460 uint32_t z[20];
461 } p256_jacobian;
462
463 /*
464 * Convert a point to affine coordinates:
465 * - If the point is the point at infinity, then all three coordinates
466 * are set to 0.
467 * - Otherwise, the 'z' coordinate is set to 1, and the 'x' and 'y'
468 * coordinates are the 'X' and 'Y' affine coordinates.
469 * The coordinates are guaranteed to be lower than the modulus.
470 */
471 static void
472 p256_to_affine(p256_jacobian *P)
473 {
474 uint32_t t1[20], t2[20];
475 int i;
476
477 /*
478 * Invert z with a modular exponentiation: the modulus is
479 * p = 2^256 - 2^224 + 2^192 + 2^96 - 1, and the exponent is
480 * p-2. Exponent bit pattern (from high to low) is:
481 * - 32 bits of value 1
482 * - 31 bits of value 0
483 * - 1 bit of value 1
484 * - 96 bits of value 0
485 * - 94 bits of value 1
486 * - 1 bit of value 0
487 * - 1 bit of value 1
488 * Thus, we precompute z^(2^31-1) to speed things up.
489 *
490 * If z = 0 (point at infinity) then the modular exponentiation
491 * will yield 0, which leads to the expected result (all three
492 * coordinates set to 0).
493 */
494
495 /*
496 * A simple square-and-multiply for z^(2^31-1). We could save about
497 * two dozen multiplications here with an addition chain, but
498 * this would require a bit more code, and extra stack buffers.
499 */
500 memcpy(t1, P->z, sizeof P->z);
501 for (i = 0; i < 30; i ++) {
502 mul_f256(t1, t1, t1);
503 mul_f256(t1, t1, P->z);
504 }
505
506 /*
507 * Square-and-multiply. Apart from the squarings, we have a few
508 * multiplications to set bits to 1; we multiply by the original z
509 * for setting 1 bit, and by t1 for setting 31 bits.
510 */
511 memcpy(t2, P->z, sizeof P->z);
512 for (i = 1; i < 256; i ++) {
513 mul_f256(t2, t2, t2);
514 switch (i) {
515 case 31:
516 case 190:
517 case 221:
518 case 252:
519 mul_f256(t2, t2, t1);
520 break;
521 case 63:
522 case 253:
523 case 255:
524 mul_f256(t2, t2, P->z);
525 break;
526 }
527 }
528
529 /*
530 * Now that we have 1/z, multiply x by 1/z^2 and y by 1/z^3.
531 */
532 mul_f256(t1, t2, t2);
533 mul_f256(P->x, t1, P->x);
534 mul_f256(t1, t1, t2);
535 mul_f256(P->y, t1, P->y);
536 reduce_final_f256(P->x);
537 reduce_final_f256(P->y);
538
539 /*
540 * Multiply z by 1/z. If z = 0, then this will yield 0, otherwise
541 * this will set z to 1.
542 */
543 mul_f256(P->z, P->z, t2);
544 reduce_final_f256(P->z);
545 }
546
547 /*
548 * Double a point in P-256. This function works for all valid points,
549 * including the point at infinity.
550 */
551 static void
552 p256_double(p256_jacobian *Q)
553 {
554 /*
555 * Doubling formulas are:
556 *
557 * s = 4*x*y^2
558 * m = 3*(x + z^2)*(x - z^2)
559 * x' = m^2 - 2*s
560 * y' = m*(s - x') - 8*y^4
561 * z' = 2*y*z
562 *
563 * These formulas work for all points, including points of order 2
564 * and points at infinity:
565 * - If y = 0 then z' = 0. But there is no such point in P-256
566 * anyway.
567 * - If z = 0 then z' = 0.
568 */
569 uint32_t t1[20], t2[20], t3[20], t4[20];
570 int i;
571
572 /*
573 * Compute z^2 in t1.
574 */
575 mul_f256(t1, Q->z, Q->z);
576
577 /*
578 * Compute x-z^2 in t2 and x+z^2 in t1.
579 */
580 for (i = 0; i < 20; i ++) {
581 t2[i] = (F256[i] << 1) + Q->x[i] - t1[i];
582 t1[i] += Q->x[i];
583 }
584 norm13(t1, t1, 20);
585 norm13(t2, t2, 20);
586
587 /*
588 * Compute 3*(x+z^2)*(x-z^2) in t1.
589 */
590 mul_f256(t3, t1, t2);
591 for (i = 0; i < 20; i ++) {
592 t1[i] = MUL15(3, t3[i]);
593 }
594 norm13(t1, t1, 20);
595
596 /*
597 * Compute 4*x*y^2 (in t2) and 2*y^2 (in t3).
598 */
599 mul_f256(t3, Q->y, Q->y);
600 for (i = 0; i < 20; i ++) {
601 t3[i] <<= 1;
602 }
603 norm13(t3, t3, 20);
604 mul_f256(t2, Q->x, t3);
605 for (i = 0; i < 20; i ++) {
606 t2[i] <<= 1;
607 }
608 norm13(t2, t2, 20);
609 reduce_f256(t2);
610
611 /*
612 * Compute x' = m^2 - 2*s.
613 */
614 mul_f256(Q->x, t1, t1);
615 for (i = 0; i < 20; i ++) {
616 Q->x[i] += (F256[i] << 2) - (t2[i] << 1);
617 }
618 norm13(Q->x, Q->x, 20);
619 reduce_f256(Q->x);
620
621 /*
622 * Compute z' = 2*y*z.
623 */
624 mul_f256(t4, Q->y, Q->z);
625 for (i = 0; i < 20; i ++) {
626 Q->z[i] = t4[i] << 1;
627 }
628 norm13(Q->z, Q->z, 20);
629 reduce_f256(Q->z);
630
631 /*
632 * Compute y' = m*(s - x') - 8*y^4. Note that we already have
633 * 2*y^2 in t3.
634 */
635 for (i = 0; i < 20; i ++) {
636 t2[i] += (F256[i] << 1) - Q->x[i];
637 }
638 norm13(t2, t2, 20);
639 mul_f256(Q->y, t1, t2);
640 mul_f256(t4, t3, t3);
641 for (i = 0; i < 20; i ++) {
642 Q->y[i] += (F256[i] << 2) - (t4[i] << 1);
643 }
644 norm13(Q->y, Q->y, 20);
645 reduce_f256(Q->y);
646 }
647
648 /*
649 * Add point P2 to point P1.
650 *
651 * This function computes the wrong result in the following cases:
652 *
653 * - If P1 == 0 but P2 != 0
654 * - If P1 != 0 but P2 == 0
655 * - If P1 == P2
656 *
657 * In all three cases, P1 is set to the point at infinity.
658 *
659 * Returned value is 0 if one of the following occurs:
660 *
661 * - P1 and P2 have the same Y coordinate
662 * - P1 == 0 and P2 == 0
663 * - The Y coordinate of one of the points is 0 and the other point is
664 * the point at infinity.
665 *
666 * The third case cannot actually happen with valid points, since a point
667 * with Y == 0 is a point of order 2, and there is no point of order 2 on
668 * curve P-256.
669 *
670 * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller
671 * can apply the following:
672 *
673 * - If the result is not the point at infinity, then it is correct.
674 * - Otherwise, if the returned value is 1, then this is a case of
675 * P1+P2 == 0, so the result is indeed the point at infinity.
676 * - Otherwise, P1 == P2, so a "double" operation should have been
677 * performed.
678 */
679 static uint32_t
680 p256_add(p256_jacobian *P1, const p256_jacobian *P2)
681 {
682 /*
683 * Addtions formulas are:
684 *
685 * u1 = x1 * z2^2
686 * u2 = x2 * z1^2
687 * s1 = y1 * z2^3
688 * s2 = y2 * z1^3
689 * h = u2 - u1
690 * r = s2 - s1
691 * x3 = r^2 - h^3 - 2 * u1 * h^2
692 * y3 = r * (u1 * h^2 - x3) - s1 * h^3
693 * z3 = h * z1 * z2
694 */
695 uint32_t t1[20], t2[20], t3[20], t4[20], t5[20], t6[20], t7[20];
696 uint32_t ret;
697 int i;
698
699 /*
700 * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
701 */
702 mul_f256(t3, P2->z, P2->z);
703 mul_f256(t1, P1->x, t3);
704 mul_f256(t4, P2->z, t3);
705 mul_f256(t3, P1->y, t4);
706
707 /*
708 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
709 */
710 mul_f256(t4, P1->z, P1->z);
711 mul_f256(t2, P2->x, t4);
712 mul_f256(t5, P1->z, t4);
713 mul_f256(t4, P2->y, t5);
714
715 /*
716 * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
717 * We need to test whether r is zero, so we will do some extra
718 * reduce.
719 */
720 for (i = 0; i < 20; i ++) {
721 t2[i] += (F256[i] << 1) - t1[i];
722 t4[i] += (F256[i] << 1) - t3[i];
723 }
724 norm13(t2, t2, 20);
725 norm13(t4, t4, 20);
726 reduce_f256(t4);
727 reduce_final_f256(t4);
728 ret = 0;
729 for (i = 0; i < 20; i ++) {
730 ret |= t4[i];
731 }
732 ret = (ret | -ret) >> 31;
733
734 /*
735 * Compute u1*h^2 (in t6) and h^3 (in t5);
736 */
737 mul_f256(t7, t2, t2);
738 mul_f256(t6, t1, t7);
739 mul_f256(t5, t7, t2);
740
741 /*
742 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
743 */
744 mul_f256(P1->x, t4, t4);
745 for (i = 0; i < 20; i ++) {
746 P1->x[i] += (F256[i] << 3) - t5[i] - (t6[i] << 1);
747 }
748 norm13(P1->x, P1->x, 20);
749 reduce_f256(P1->x);
750
751 /*
752 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
753 */
754 for (i = 0; i < 20; i ++) {
755 t6[i] += (F256[i] << 1) - P1->x[i];
756 }
757 norm13(t6, t6, 20);
758 mul_f256(P1->y, t4, t6);
759 mul_f256(t1, t5, t3);
760 for (i = 0; i < 20; i ++) {
761 P1->y[i] += (F256[i] << 1) - t1[i];
762 }
763 norm13(P1->y, P1->y, 20);
764 reduce_f256(P1->y);
765
766 /*
767 * Compute z3 = h*z1*z2.
768 */
769 mul_f256(t1, P1->z, P2->z);
770 mul_f256(P1->z, t1, t2);
771
772 return ret;
773 }
774
775 /*
776 * Decode a P-256 point. This function does not support the point at
777 * infinity. Returned value is 0 if the point is invalid, 1 otherwise.
778 */
779 static uint32_t
780 p256_decode(p256_jacobian *P, const void *src, size_t len)
781 {
782 const unsigned char *buf;
783 uint32_t tx[20], ty[20], t1[20], t2[20];
784 uint32_t bad;
785 int i;
786
787 if (len != 65) {
788 return 0;
789 }
790 buf = src;
791
792 /*
793 * First byte must be 0x04 (uncompressed format). We could support
794 * "hybrid format" (first byte is 0x06 or 0x07, and encodes the
795 * least significant bit of the Y coordinate), but it is explicitly
796 * forbidden by RFC 5480 (section 2.2).
797 */
798 bad = NEQ(buf[0], 0x04);
799
800 /*
801 * Decode the coordinates, and check that they are both lower
802 * than the modulus.
803 */
804 tx[19] = be8_to_le13(tx, buf + 1, 32);
805 ty[19] = be8_to_le13(ty, buf + 33, 32);
806 bad |= reduce_final_f256(tx);
807 bad |= reduce_final_f256(ty);
808
809 /*
810 * Check curve equation.
811 */
812 mul_f256(t1, tx, tx);
813 mul_f256(t1, tx, t1);
814 mul_f256(t2, ty, ty);
815 for (i = 0; i < 20; i ++) {
816 t1[i] += (F256[i] << 3) - MUL15(3, tx[i]) + P256_B[i] - t2[i];
817 }
818 norm13(t1, t1, 20);
819 reduce_f256(t1);
820 reduce_final_f256(t1);
821 for (i = 0; i < 20; i ++) {
822 bad |= t1[i];
823 }
824
825 /*
826 * Copy coordinates to the point structure.
827 */
828 memcpy(P->x, tx, sizeof tx);
829 memcpy(P->y, ty, sizeof ty);
830 memset(P->z, 0, sizeof P->z);
831 P->z[0] = 1;
832 return NEQ(bad, 0) ^ 1;
833 }
834
835 /*
836 * Encode a point into a buffer. This function assumes that the point is
837 * valid, in affine coordinates, and not the point at infinity.
838 */
839 static void
840 p256_encode(void *dst, const p256_jacobian *P)
841 {
842 unsigned char *buf;
843
844 buf = dst;
845 buf[0] = 0x04;
846 le13_to_be8(buf + 1, 32, P->x);
847 le13_to_be8(buf + 33, 32, P->y);
848 }
849
850 /*
851 * Multiply a curve point by an integer. The integer is assumed to be
852 * lower than the curve order, and the base point must not be the point
853 * at infinity.
854 */
855 static void
856 p256_mul(p256_jacobian *P, const unsigned char *x, size_t xlen)
857 {
858 /*
859 * qz is a flag that is initially 1, and remains equal to 1
860 * as long as the point is the point at infinity.
861 *
862 * We use a 2-bit window to handle multiplier bits by pairs.
863 * The precomputed window really is the points P2 and P3.
864 */
865 uint32_t qz;
866 p256_jacobian P2, P3, Q, T, U;
867
868 /*
869 * Compute window values.
870 */
871 P2 = *P;
872 p256_double(&P2);
873 P3 = *P;
874 p256_add(&P3, &P2);
875
876 /*
877 * We start with Q = 0. We process multiplier bits 2 by 2.
878 */
879 memset(&Q, 0, sizeof Q);
880 qz = 1;
881 while (xlen -- > 0) {
882 int k;
883
884 for (k = 6; k >= 0; k -= 2) {
885 uint32_t bits;
886 uint32_t bnz;
887
888 p256_double(&Q);
889 p256_double(&Q);
890 T = *P;
891 U = Q;
892 bits = (*x >> k) & (uint32_t)3;
893 bnz = NEQ(bits, 0);
894 CCOPY(EQ(bits, 2), &T, &P2, sizeof T);
895 CCOPY(EQ(bits, 3), &T, &P3, sizeof T);
896 p256_add(&U, &T);
897 CCOPY(bnz & qz, &Q, &T, sizeof Q);
898 CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
899 qz &= ~bnz;
900 }
901 x ++;
902 }
903 *P = Q;
904 }
905
906 static const unsigned char P256_G[] = {
907 0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8,
908 0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D,
909 0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8,
910 0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
911 0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B,
912 0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40,
913 0x68, 0x37, 0xBF, 0x51, 0xF5
914 };
915
916 static const unsigned char P256_N[] = {
917 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
918 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD,
919 0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63,
920 0x25, 0x51
921 };
922
923 static const unsigned char *
924 api_generator(int curve, size_t *len)
925 {
926 (void)curve;
927 *len = sizeof P256_G;
928 return P256_G;
929 }
930
931 static const unsigned char *
932 api_order(int curve, size_t *len)
933 {
934 (void)curve;
935 *len = sizeof P256_N;
936 return P256_N;
937 }
938
939 static uint32_t
940 api_mul(unsigned char *G, size_t Glen,
941 const unsigned char *x, size_t xlen, int curve)
942 {
943 uint32_t r;
944 p256_jacobian P;
945
946 (void)curve;
947 r = p256_decode(&P, G, Glen);
948 p256_mul(&P, x, xlen);
949 if (Glen >= 65) {
950 p256_to_affine(&P);
951 p256_encode(G, &P);
952 }
953 return r;
954 }
955
956 static uint32_t
957 api_muladd(unsigned char *A, const unsigned char *B, size_t len,
958 const unsigned char *x, size_t xlen,
959 const unsigned char *y, size_t ylen, int curve)
960 {
961 p256_jacobian P, Q;
962 uint32_t r, t, z;
963 int i;
964
965 (void)curve;
966 r = p256_decode(&P, A, len);
967 r &= p256_decode(&Q, B, len);
968 p256_mul(&P, x, xlen);
969 p256_mul(&Q, y, ylen);
970
971 /*
972 * The final addition may fail in case both points are equal.
973 */
974 t = p256_add(&P, &Q);
975 reduce_final_f256(P.z);
976 z = 0;
977 for (i = 0; i < 20; i ++) {
978 z |= P.z[i];
979 }
980 z = EQ(z, 0);
981 p256_double(&Q);
982
983 /*
984 * If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
985 * have the following:
986 *
987 * z = 0, t = 0 return P (normal addition)
988 * z = 0, t = 1 return P (normal addition)
989 * z = 1, t = 0 return Q (a 'double' case)
990 * z = 1, t = 1 report an error (P+Q = 0)
991 */
992 CCOPY(z & ~t, &P, &Q, sizeof Q);
993 p256_to_affine(&P);
994 p256_encode(A, &P);
995 r &= ~(z & t);
996 return r;
997 }
998
999 /* see bearssl_ec.h */
1000 const br_ec_impl br_ec_p256_i15 = {
1001 (uint32_t)0x00800000,
1002 &api_generator,
1003 &api_order,
1004 &api_mul,
1005 &api_muladd
1006 };