_ Git - BearSSL/blob - src/ec/ec_p256_i15.c

   1 /*
   2  * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining
   5  * a copy of this software and associated documentation files (the
   6  * "Software"), to deal in the Software without restriction, including
   7  * without limitation the rights to use, copy, modify, merge, publish,
   8  * distribute, sublicense, and/or sell copies of the Software, and to
   9  * permit persons to whom the Software is furnished to do so, subject to
  10  * the following conditions:
  11  *
  12  * The above copyright notice and this permission notice shall be
  13  * included in all copies or substantial portions of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  16  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  17  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  18  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  19  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  20  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  21  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24
  25 #include "inner.h"
  26
  27 /*
  28  * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
  29  * that right-shifting a signed negative integer copies the sign bit
  30  * (arithmetic right-shift). This is "implementation-defined behaviour",
  31  * i.e. it is not undefined, but it may differ between compilers. Each
  32  * compiler is supposed to document its behaviour in that respect. GCC
  33  * explicitly defines that an arithmetic right shift is used. We expect
  34  * all other compilers to do the same, because underlying CPU offer an
  35  * arithmetic right shift opcode that could not be used otherwise.
  36  */
  37 #if BR_NO_ARITH_SHIFT
  38 #define ARSH(x, n)   (((uint32_t)(x) >> (n)) \
  39                     | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
  40 #else
  41 #define ARSH(x, n)   ((*(int32_t *)&(x)) >> (n))
  42 #endif
  43
  44 /*
  45  * Convert an integer from unsigned big-endian encoding to a sequence of
  46  * 13-bit words in little-endian order. The final "partial" word is
  47  * returned.
  48  */
  49 static uint32_t
  50 be8_to_le13(uint32_t *dst, const unsigned char *src, size_t len)
  51 {
  52         uint32_t acc;
  53         int acc_len;
  54
  55         acc = 0;
  56         acc_len = 0;
  57         while (len -- > 0) {
  58                 acc |= (uint32_t)src[len] << acc_len;
  59                 acc_len += 8;
  60                 if (acc_len >= 13) {
  61                         *dst ++ = acc & 0x1FFF;
  62                         acc >>= 13;
  63                         acc_len -= 13;
  64                 }
  65         }
  66         return acc;
  67 }
  68
  69 /*
  70  * Convert an integer (13-bit words, little-endian) to unsigned
  71  * big-endian encoding. The total encoding length is provided; all
  72  * the destination bytes will be filled.
  73  */
  74 static void
  75 le13_to_be8(unsigned char *dst, size_t len, const uint32_t *src)
  76 {
  77         uint32_t acc;
  78         int acc_len;
  79
  80         acc = 0;
  81         acc_len = 0;
  82         while (len -- > 0) {
  83                 if (acc_len < 8) {
  84                         acc |= (*src ++) << acc_len;
  85                         acc_len += 13;
  86                 }
  87                 dst[len] = (unsigned char)acc;
  88                 acc >>= 8;
  89                 acc_len -= 8;
  90         }
  91 }
  92
  93 /*
  94  * Multiply two 4-word integers. Basis is 2^13 = 8192. Individual words
  95  * may have value up to 32766. Result integer consists in eight words;
  96  * each word is in the 0..8191 range, except the last one (d[7]) which
  97  * gathers the excess bits.
  98  *
  99  * Max value for d[7], depending on max word value:
 100  *
 101  *   32764: 131071
 102  *   32765: 131080
 103  *   32766: 131088
 104  */
 105 static inline void
 106 mul4(uint32_t *d, const uint32_t *a, const uint32_t *b)
 107 {
 108         uint32_t t0, t1, t2, t3, t4, t5, t6;
 109
 110         t0 = MUL15(a[0], b[0]);
 111         t1 = MUL15(a[0], b[1]) + MUL15(a[1], b[0]);
 112         t2 = MUL15(a[0], b[2]) + MUL15(a[1], b[1]) + MUL15(a[2], b[0]);
 113         t3 = MUL15(a[0], b[3]) + MUL15(a[1], b[2])
 114                 + MUL15(a[2], b[1]) + MUL15(a[3], b[0]);
 115         t4 = MUL15(a[1], b[3]) + MUL15(a[2], b[2]) + MUL15(a[3], b[1]);
 116         t5 = MUL15(a[2], b[3]) + MUL15(a[3], b[2]);
 117         t6 = MUL15(a[3], b[3]);
 118
 119         /*
 120          * Maximum value is obtained when adding the carry to t3, when all
 121          * input words have maximum value. When the maximum is 32766,
 122          * the addition of t3 with the carry (t2 >> 13) yields 4294836223,
 123          * which is still below 2^32 = 4294967296.
 124          */
 125
 126         d[0] = t0 & 0x1FFF;
 127         t1 += t0 >> 13;
 128         d[1] = t1 & 0x1FFF;
 129         t2 += t1 >> 13;
 130         d[2] = t2 & 0x1FFF;
 131         t3 += t2 >> 13;
 132         d[3] = t3 & 0x1FFF;
 133         t4 += t3 >> 13;
 134         d[4] = t4 & 0x1FFF;
 135         t5 += t4 >> 13;
 136         d[5] = t5 & 0x1FFF;
 137         t6 += t5 >> 13;
 138         d[6] = t6 & 0x1FFF;
 139         d[7] = t6 >> 13;
 140 }
 141
 142 /*
 143  * Normalise an array of words to a strict 13 bits per word. Returned
 144  * value is the resulting carry. The source (w) and destination (d)
 145  * arrays may be identical, but shall not overlap partially.
 146  */
 147 static uint32_t
 148 norm13(uint32_t *d, const uint32_t *w, size_t len)
 149 {
 150         size_t u;
 151         uint32_t cc;
 152
 153         cc = 0;
 154         for (u = 0; u < len; u ++) {
 155                 int32_t z;
 156
 157                 z = w[u] + cc;
 158                 d[u] = z & 0x1FFF;
 159                 cc = ARSH(z, 13);
 160         }
 161         return cc;
 162 }
 163
 164 /*
 165  * Multiply two 20-word values together, result over 40 words. Each word
 166  * contains 13 bits of data; the top words (a[19] and b[19]) may contain
 167  * up to 15 bits each. Therefore this routine handles integer up to 2^262-1.
 168  * All words in the output have length 13 bits, except possibly the top
 169  * one, in case the sources had excess bits.
 170  */
 171 static void
 172 mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
 173 {
 174         /*
 175          * We first split the 20-word values into a 16-word value and
 176          * a 4-word value. The 16x16 product uses two levels of Karatsuba
 177          * decomposition. The preparatory additions are done word-wise
 178          * without carry propagation; since the input words have size
 179          * at most 13 bits, the sums may be up to 4*8191 = 32764, which
 180          * is in the range supported by mul4().
 181          */
 182         uint32_t u[72], v[72], w[144];
 183         uint32_t cc;
 184         int i;
 185
 186         /*
 187          * Karatsuba decomposition, two levels.
 188          *
 189          * off   src
 190          *       0..7 * 0..7
 191          *   0      0..3 * 0..3
 192          *   4      4..7 * 4..7
 193          *  16      0..3+4..7 * 0..3+4..7
 194          *
 195          *       8..15 * 8..15
 196          *   8      8..11 * 8..11
 197          *  12      12..15 * 12..15
 198          *  20      8..11+12..15 * 8..11+12..15
 199          *
 200          *       0..7+8..15 * 0..7+8..15
 201          *  24      0..3+8..11 * 0..3+8..11
 202          *  28      4..7+12..15 * 4..7+12..15
 203          *  32      0..3+4..7+8..11+12..15 * 0..3+4..7+8..11+12..15
 204          */
 205
 206 #define WADD(z, x, y)    do { \
 207                 u[(z) + 0] = u[(x) + 0] + u[y + 0]; \
 208                 u[(z) + 1] = u[(x) + 1] + u[y + 1]; \
 209                 u[(z) + 2] = u[(x) + 2] + u[y + 2]; \
 210                 u[(z) + 3] = u[(x) + 3] + u[y + 3]; \
 211                 v[(z) + 0] = v[(x) + 0] + v[y + 0]; \
 212                 v[(z) + 1] = v[(x) + 1] + v[y + 1]; \
 213                 v[(z) + 2] = v[(x) + 2] + v[y + 2]; \
 214                 v[(z) + 3] = v[(x) + 3] + v[y + 3]; \
 215         } while (0)
 216
 217         memcpy(u, a, 16 * sizeof *a);
 218         memcpy(v, b, 16 * sizeof *b);
 219         WADD(16, 0, 4);
 220         WADD(20, 8, 12);
 221         WADD(24, 0, 8);
 222         WADD(28, 4, 12);
 223         WADD(32, 16, 20);
 224         memcpy(u + 36, a, 16 * sizeof *a);
 225         memcpy(v + 52, b, 16 * sizeof *b);
 226         for (i = 0; i < 4; i ++) {
 227                 memcpy(u + 52 + (i << 2), a + 16, 4 * sizeof *a);
 228                 memcpy(v + 36 + (i << 2), b + 16, 4 * sizeof *b);
 229         }
 230         memcpy(u + 68, a + 16, 4 * sizeof *a);
 231         memcpy(v + 68, b + 16, 4 * sizeof *b);
 232
 233 #undef WADD
 234
 235         /*
 236          * Perform the elementary 4x4 multiplications:
 237          *   16x16: 9 multiplications (Karatsuba)
 238          *   16x4 (1): 4 multiplications
 239          *   16x4 (2): 4 multiplications
 240          *   4x4: 1 multiplication
 241          */
 242         for (i = 0; i < 18; i ++) {
 243                 mul4(w + (i << 3), u + (i << 2), v + (i << 2));
 244         }
 245
 246         /*
 247          * mul4 cross-product adjustments:
 248          *   subtract 0 and 8 from 32 (8 words)
 249          *   subtract 16 and 24 from 40 (8 words)
 250          *   subtract 48 and 56 from 64 (8 words)
 251          */
 252         for (i = 0; i < 8; i ++) {
 253                 w[i + 32] -= w[i +  0] + w[i +  8];
 254                 w[i + 40] -= w[i + 16] + w[i + 24];
 255                 w[i + 64] -= w[i + 48] + w[i + 56];
 256         }
 257
 258         /*
 259          * complete the three 8x8 products:
 260          *   add 32 to 4 (8 words)
 261          *   add 40 to 20 (8 words)
 262          *   add 64 to 52 (8 words)
 263          */
 264         for (i = 0; i < 8; i ++) {
 265                 w[i +  4] += w[i + 32];
 266                 w[i + 20] += w[i + 40];
 267                 w[i + 52] += w[i + 64];
 268         }
 269
 270         /*
 271          * Adjust the 16x16 product:
 272          *   subtract 0 from 48 (16 words)
 273          *   subtract 16 from 48 (16 words)
 274          *   add 48 to 8 (16 words)
 275          */
 276         for (i = 0; i < 16; i ++) {
 277                 w[i + 48] -= w[i +  0];
 278                 w[i + 48] -= w[i + 16];
 279         }
 280         for (i = 0; i < 16; i ++) {
 281                 w[i + 8] += w[i + 48];
 282         }
 283
 284         /*
 285          * At that point, the product of the low chunks (0..15 * 0..15)
 286          * is in words 0..31. We must add the three other partial products,
 287          * which begin at word 72 in w[]. Words 32 to 39 are first set to
 288          * the product of the high chunks (16..19 * 16..19), then the
 289          * low-high cross products are added in.
 290          */
 291         memcpy(w + 32, w + 136, 8 * sizeof w[0]);
 292         for (i = 0; i < 8; i ++) {
 293                 w[i + 16] += w[i + 72] + w[i + 104];
 294                 w[i + 20] += w[i + 80] + w[i + 112];
 295                 w[i + 24] += w[i + 88] + w[i + 120];
 296                 w[i + 28] += w[i + 96] + w[i + 128];
 297         }
 298
 299         /*
 300          * We did all the additions and subtractions in a word-wise way,
 301          * which is fine since we have plenty of extra bits for carries.
 302          * We must now do the carry propagation.
 303          */
 304         cc = norm13(d, w, 40);
 305         d[39] += (cc << 13);
 306 }
 307
 308 /*
 309  * Modulus for field F256 (field for point coordinates in curve P-256).
 310  */
 311 static const uint32_t F256[] = {
 312         0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x001F,
 313         0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0400, 0x0000,
 314         0x0000, 0x1FF8, 0x1FFF, 0x01FF
 315 };
 316
 317 /*
 318  * The 'b' curve equation coefficient for P-256.
 319  */
 320 static const uint32_t P256_B[] = {
 321         0x004B, 0x1E93, 0x0F89, 0x1C78, 0x03BC, 0x187B, 0x114E, 0x1619,
 322         0x1D06, 0x0328, 0x01AF, 0x0D31, 0x1557, 0x15DE, 0x1ECF, 0x127C,
 323         0x0A3A, 0x0EC5, 0x118D, 0x00B5
 324 };
 325
 326 /*
 327  * Perform a "short reduction" in field F256 (field for curve P-256).
 328  * The source value should be less than 262 bits; on output, it will
 329  * be at most 257 bits, and less than twice the modulus.
 330  */
 331 static void
 332 reduce_f256(uint32_t *d)
 333 {
 334         uint32_t x;
 335
 336         x = d[19] >> 9;
 337         d[19] &= 0x01FF;
 338         d[17] += x << 3;
 339         d[14] -= x << 10;
 340         d[7] -= x << 5;
 341         d[0] += x;
 342         norm13(d, d, 20);
 343 }
 344
 345 /*
 346  * Perform a "final reduction" in field F256 (field for curve P-256).
 347  * The source value must be less than twice the modulus. If the value
 348  * is not lower than the modulus, then the modulus is subtracted and
 349  * this function returns 1; otherwise, it leaves it untouched and it
 350  * returns 0.
 351  */
 352 static uint32_t
 353 reduce_final_f256(uint32_t *d)
 354 {
 355         uint32_t t[20];
 356         uint32_t cc;
 357         int i;
 358
 359         memcpy(t, d, sizeof t);
 360         cc = 0;
 361         for (i = 0; i < 20; i ++) {
 362                 uint32_t w;
 363
 364                 w = t[i] - F256[i] - cc;
 365                 cc = w >> 31;
 366                 t[i] = w & 0x1FFF;
 367         }
 368         cc ^= 1;
 369         CCOPY(cc, d, t, sizeof t);
 370         return cc;
 371 }
 372
 373 /*
 374  * Perform a multiplication of two integers modulo
 375  * 2^256-2^224+2^192+2^96-1 (for NIST curve P-256). Operands are arrays
 376  * of 20 words, each containing 13 bits of data, in little-endian order.
 377  * On input, upper word may be up to 15 bits (hence value up to 2^262-1);
 378  * on output, value fits on 257 bits and is lower than twice the modulus.
 379  */
 380 static void
 381 mul_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
 382 {
 383         uint32_t t[40], cc;
 384         int i;
 385
 386         /*
 387          * Compute raw multiplication. All result words fit in 13 bits
 388          * each.
 389          */
 390         mul20(t, a, b);
 391
 392         /*
 393          * Modular reduction: each high word in added/subtracted where
 394          * necessary.
 395          *
 396          * The modulus is:
 397          *    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
 398          * Therefore:
 399          *    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
 400          *
 401          * For a word x at bit offset n (n >= 256), we have:
 402          *    x*2^n = x*2^(n-32) - x*2^(n-64)
 403          *            - x*2^(n - 160) + x*2^(n-256) mod p
 404          *
 405          * Thus, we can nullify the high word if we reinject it at some
 406          * proper emplacements.
 407          */
 408         for (i = 39; i >= 20; i --) {
 409                 uint32_t x;
 410
 411                 x = t[i];
 412                 t[i - 2] += ARSH(x, 6);
 413                 t[i - 3] += (x << 7) & 0x1FFF;
 414                 t[i - 4] -= ARSH(x, 12);
 415                 t[i - 5] -= (x << 1) & 0x1FFF;
 416                 t[i - 12] -= ARSH(x, 4);
 417                 t[i - 13] -= (x << 9) & 0x1FFF;
 418                 t[i - 19] += ARSH(x, 9);
 419                 t[i - 20] += (x << 4) & 0x1FFF;
 420         }
 421
 422         /*
 423          * Propagate carries. Since the operation above really is a
 424          * truncature, followed by the addition of nonnegative values,
 425          * the result will be positive. Moreover, the carry cannot
 426          * exceed 5 bits (we performed 20 additions with values smaller
 427          * than 256 bits).
 428          */
 429         cc = norm13(t, t, 20);
 430
 431         /*
 432          * Perform modular reduction again for the bits beyond 256 (the carry
 433          * and the bits 256..259). This time, we can simply inject full
 434          * word values.
 435          */
 436         cc = (cc << 4) | (t[19] >> 9);
 437         t[19] &= 0x01FF;
 438         t[17] += cc << 3;
 439         t[14] -= cc << 10;
 440         t[7] -= cc << 5;
 441         t[0] += cc;
 442         norm13(d, t, 20);
 443 }
 444
 445 /*
 446  * Jacobian coordinates for a point in P-256: affine coordinates (X,Y)
 447  * are such that:
 448  *   X = x / z^2
 449  *   Y = y / z^3
 450  * For the point at infinity, z = 0.
 451  * Each point thus admits many possible representations.
 452  *
 453  * Coordinates are represented in arrays of 32-bit integers, each holding
 454  * 13 bits of data. Values may also be slightly greater than the modulus,
 455  * but they will always be lower than twice the modulus.
 456  */
 457 typedef struct {
 458         uint32_t x[20];
 459         uint32_t y[20];
 460         uint32_t z[20];
 461 } p256_jacobian;
 462
 463 /*
 464  * Convert a point to affine coordinates:
 465  *  - If the point is the point at infinity, then all three coordinates
 466  *    are set to 0.
 467  *  - Otherwise, the 'z' coordinate is set to 1, and the 'x' and 'y'
 468  *    coordinates are the 'X' and 'Y' affine coordinates.
 469  * The coordinates are guaranteed to be lower than the modulus.
 470  */
 471 static void
 472 p256_to_affine(p256_jacobian *P)
 473 {
 474         uint32_t t1[20], t2[20];
 475         int i;
 476
 477         /*
 478          * Invert z with a modular exponentiation: the modulus is
 479          * p = 2^256 - 2^224 + 2^192 + 2^96 - 1, and the exponent is
 480          * p-2. Exponent bit pattern (from high to low) is:
 481          *  - 32 bits of value 1
 482          *  - 31 bits of value 0
 483          *  - 1 bit of value 1
 484          *  - 96 bits of value 0
 485          *  - 94 bits of value 1
 486          *  - 1 bit of value 0
 487          *  - 1 bit of value 1
 488          * Thus, we precompute z^(2^31-1) to speed things up.
 489          *
 490          * If z = 0 (point at infinity) then the modular exponentiation
 491          * will yield 0, which leads to the expected result (all three
 492          * coordinates set to 0).
 493          */
 494
 495         /*
 496          * A simple square-and-multiply for z^(2^31-1). We could save about
 497          * two dozen multiplications here with an addition chain, but
 498          * this would require a bit more code, and extra stack buffers.
 499          */
 500         memcpy(t1, P->z, sizeof P->z);
 501         for (i = 0; i < 30; i ++) {
 502                 mul_f256(t1, t1, t1);
 503                 mul_f256(t1, t1, P->z);
 504         }
 505
 506         /*
 507          * Square-and-multiply. Apart from the squarings, we have a few
 508          * multiplications to set bits to 1; we multiply by the original z
 509          * for setting 1 bit, and by t1 for setting 31 bits.
 510          */
 511         memcpy(t2, P->z, sizeof P->z);
 512         for (i = 1; i < 256; i ++) {
 513                 mul_f256(t2, t2, t2);
 514                 switch (i) {
 515                 case 31:
 516                 case 190:
 517                 case 221:
 518                 case 252:
 519                         mul_f256(t2, t2, t1);
 520                         break;
 521                 case 63:
 522                 case 253:
 523                 case 255:
 524                         mul_f256(t2, t2, P->z);
 525                         break;
 526                 }
 527         }
 528
 529         /*
 530          * Now that we have 1/z, multiply x by 1/z^2 and y by 1/z^3.
 531          */
 532         mul_f256(t1, t2, t2);
 533         mul_f256(P->x, t1, P->x);
 534         mul_f256(t1, t1, t2);
 535         mul_f256(P->y, t1, P->y);
 536         reduce_final_f256(P->x);
 537         reduce_final_f256(P->y);
 538
 539         /*
 540          * Multiply z by 1/z. If z = 0, then this will yield 0, otherwise
 541          * this will set z to 1.
 542          */
 543         mul_f256(P->z, P->z, t2);
 544         reduce_final_f256(P->z);
 545 }
 546
 547 /*
 548  * Double a point in P-256. This function works for all valid points,
 549  * including the point at infinity.
 550  */
 551 static void
 552 p256_double(p256_jacobian *Q)
 553 {
 554         /*
 555          * Doubling formulas are:
 556          *
 557          *   s = 4*x*y^2
 558          *   m = 3*(x + z^2)*(x - z^2)
 559          *   x' = m^2 - 2*s
 560          *   y' = m*(s - x') - 8*y^4
 561          *   z' = 2*y*z
 562          *
 563          * These formulas work for all points, including points of order 2
 564          * and points at infinity:
 565          *   - If y = 0 then z' = 0. But there is no such point in P-256
 566          *     anyway.
 567          *   - If z = 0 then z' = 0.
 568          */
 569         uint32_t t1[20], t2[20], t3[20], t4[20];
 570         int i;
 571
 572         /*
 573          * Compute z^2 in t1.
 574          */
 575         mul_f256(t1, Q->z, Q->z);
 576
 577         /*
 578          * Compute x-z^2 in t2 and x+z^2 in t1.
 579          */
 580         for (i = 0; i < 20; i ++) {
 581                 t2[i] = (F256[i] << 1) + Q->x[i] - t1[i];
 582                 t1[i] += Q->x[i];
 583         }
 584         norm13(t1, t1, 20);
 585         norm13(t2, t2, 20);
 586
 587         /*
 588          * Compute 3*(x+z^2)*(x-z^2) in t1.
 589          */
 590         mul_f256(t3, t1, t2);
 591         for (i = 0; i < 20; i ++) {
 592                 t1[i] = MUL15(3, t3[i]);
 593         }
 594         norm13(t1, t1, 20);
 595
 596         /*
 597          * Compute 4*x*y^2 (in t2) and 2*y^2 (in t3).
 598          */
 599         mul_f256(t3, Q->y, Q->y);
 600         for (i = 0; i < 20; i ++) {
 601                 t3[i] <<= 1;
 602         }
 603         norm13(t3, t3, 20);
 604         mul_f256(t2, Q->x, t3);
 605         for (i = 0; i < 20; i ++) {
 606                 t2[i] <<= 1;
 607         }
 608         norm13(t2, t2, 20);
 609         reduce_f256(t2);
 610
 611         /*
 612          * Compute x' = m^2 - 2*s.
 613          */
 614         mul_f256(Q->x, t1, t1);
 615         for (i = 0; i < 20; i ++) {
 616                 Q->x[i] += (F256[i] << 2) - (t2[i] << 1);
 617         }
 618         norm13(Q->x, Q->x, 20);
 619         reduce_f256(Q->x);
 620
 621         /*
 622          * Compute z' = 2*y*z.
 623          */
 624         mul_f256(t4, Q->y, Q->z);
 625         for (i = 0; i < 20; i ++) {
 626                 Q->z[i] = t4[i] << 1;
 627         }
 628         norm13(Q->z, Q->z, 20);
 629         reduce_f256(Q->z);
 630
 631         /*
 632          * Compute y' = m*(s - x') - 8*y^4. Note that we already have
 633          * 2*y^2 in t3.
 634          */
 635         for (i = 0; i < 20; i ++) {
 636                 t2[i] += (F256[i] << 1) - Q->x[i];
 637         }
 638         norm13(t2, t2, 20);
 639         mul_f256(Q->y, t1, t2);
 640         mul_f256(t4, t3, t3);
 641         for (i = 0; i < 20; i ++) {
 642                 Q->y[i] += (F256[i] << 2) - (t4[i] << 1);
 643         }
 644         norm13(Q->y, Q->y, 20);
 645         reduce_f256(Q->y);
 646 }
 647
 648 /*
 649  * Add point P2 to point P1.
 650  *
 651  * This function computes the wrong result in the following cases:
 652  *
 653  *   - If P1 == 0 but P2 != 0
 654  *   - If P1 != 0 but P2 == 0
 655  *   - If P1 == P2
 656  *
 657  * In all three cases, P1 is set to the point at infinity.
 658  *
 659  * Returned value is 0 if one of the following occurs:
 660  *
 661  *   - P1 and P2 have the same Y coordinate
 662  *   - P1 == 0 and P2 == 0
 663  *   - The Y coordinate of one of the points is 0 and the other point is
 664  *     the point at infinity.
 665  *
 666  * The third case cannot actually happen with valid points, since a point
 667  * with Y == 0 is a point of order 2, and there is no point of order 2 on
 668  * curve P-256.
 669  *
 670  * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller
 671  * can apply the following:
 672  *
 673  *   - If the result is not the point at infinity, then it is correct.
 674  *   - Otherwise, if the returned value is 1, then this is a case of
 675  *     P1+P2 == 0, so the result is indeed the point at infinity.
 676  *   - Otherwise, P1 == P2, so a "double" operation should have been
 677  *     performed.
 678  */
 679 static uint32_t
 680 p256_add(p256_jacobian *P1, const p256_jacobian *P2)
 681 {
 682         /*
 683          * Addtions formulas are:
 684          *
 685          *   u1 = x1 * z2^2
 686          *   u2 = x2 * z1^2
 687          *   s1 = y1 * z2^3
 688          *   s2 = y2 * z1^3
 689          *   h = u2 - u1
 690          *   r = s2 - s1
 691          *   x3 = r^2 - h^3 - 2 * u1 * h^2
 692          *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
 693          *   z3 = h * z1 * z2
 694          */
 695         uint32_t t1[20], t2[20], t3[20], t4[20], t5[20], t6[20], t7[20];
 696         uint32_t ret;
 697         int i;
 698
 699         /*
 700          * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
 701          */
 702         mul_f256(t3, P2->z, P2->z);
 703         mul_f256(t1, P1->x, t3);
 704         mul_f256(t4, P2->z, t3);
 705         mul_f256(t3, P1->y, t4);
 706
 707         /*
 708          * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
 709          */
 710         mul_f256(t4, P1->z, P1->z);
 711         mul_f256(t2, P2->x, t4);
 712         mul_f256(t5, P1->z, t4);
 713         mul_f256(t4, P2->y, t5);
 714
 715         /*
 716          * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
 717          * We need to test whether r is zero, so we will do some extra
 718          * reduce.
 719          */
 720         for (i = 0; i < 20; i ++) {
 721                 t2[i] += (F256[i] << 1) - t1[i];
 722                 t4[i] += (F256[i] << 1) - t3[i];
 723         }
 724         norm13(t2, t2, 20);
 725         norm13(t4, t4, 20);
 726         reduce_f256(t4);
 727         reduce_final_f256(t4);
 728         ret = 0;
 729         for (i = 0; i < 20; i ++) {
 730                 ret |= t4[i];
 731         }
 732         ret = (ret | -ret) >> 31;
 733
 734         /*
 735          * Compute u1*h^2 (in t6) and h^3 (in t5);
 736          */
 737         mul_f256(t7, t2, t2);
 738         mul_f256(t6, t1, t7);
 739         mul_f256(t5, t7, t2);
 740
 741         /*
 742          * Compute x3 = r^2 - h^3 - 2*u1*h^2.
 743          */
 744         mul_f256(P1->x, t4, t4);
 745         for (i = 0; i < 20; i ++) {
 746                 P1->x[i] += (F256[i] << 3) - t5[i] - (t6[i] << 1);
 747         }
 748         norm13(P1->x, P1->x, 20);
 749         reduce_f256(P1->x);
 750
 751         /*
 752          * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
 753          */
 754         for (i = 0; i < 20; i ++) {
 755                 t6[i] += (F256[i] << 1) - P1->x[i];
 756         }
 757         norm13(t6, t6, 20);
 758         mul_f256(P1->y, t4, t6);
 759         mul_f256(t1, t5, t3);
 760         for (i = 0; i < 20; i ++) {
 761                 P1->y[i] += (F256[i] << 1) - t1[i];
 762         }
 763         norm13(P1->y, P1->y, 20);
 764         reduce_f256(P1->y);
 765
 766         /*
 767          * Compute z3 = h*z1*z2.
 768          */
 769         mul_f256(t1, P1->z, P2->z);
 770         mul_f256(P1->z, t1, t2);
 771
 772         return ret;
 773 }
 774
 775 /*
 776  * Decode a P-256 point. This function does not support the point at
 777  * infinity. Returned value is 0 if the point is invalid, 1 otherwise.
 778  */
 779 static uint32_t
 780 p256_decode(p256_jacobian *P, const void *src, size_t len)
 781 {
 782         const unsigned char *buf;
 783         uint32_t tx[20], ty[20], t1[20], t2[20];
 784         uint32_t bad;
 785         int i;
 786
 787         if (len != 65) {
 788                 return 0;
 789         }
 790         buf = src;
 791
 792         /*
 793          * First byte must be 0x04 (uncompressed format). We could support
 794          * "hybrid format" (first byte is 0x06 or 0x07, and encodes the
 795          * least significant bit of the Y coordinate), but it is explicitly
 796          * forbidden by RFC 5480 (section 2.2).
 797          */
 798         bad = NEQ(buf[0], 0x04);
 799
 800         /*
 801          * Decode the coordinates, and check that they are both lower
 802          * than the modulus.
 803          */
 804         tx[19] = be8_to_le13(tx, buf + 1, 32);
 805         ty[19] = be8_to_le13(ty, buf + 33, 32);
 806         bad |= reduce_final_f256(tx);
 807         bad |= reduce_final_f256(ty);
 808
 809         /*
 810          * Check curve equation.
 811          */
 812         mul_f256(t1, tx, tx);
 813         mul_f256(t1, tx, t1);
 814         mul_f256(t2, ty, ty);
 815         for (i = 0; i < 20; i ++) {
 816                 t1[i] += (F256[i] << 3) - MUL15(3, tx[i]) + P256_B[i] - t2[i];
 817         }
 818         norm13(t1, t1, 20);
 819         reduce_f256(t1);
 820         reduce_final_f256(t1);
 821         for (i = 0; i < 20; i ++) {
 822                 bad |= t1[i];
 823         }
 824
 825         /*
 826          * Copy coordinates to the point structure.
 827          */
 828         memcpy(P->x, tx, sizeof tx);
 829         memcpy(P->y, ty, sizeof ty);
 830         memset(P->z, 0, sizeof P->z);
 831         P->z[0] = 1;
 832         return NEQ(bad, 0) ^ 1;
 833 }
 834
 835 /*
 836  * Encode a point into a buffer. This function assumes that the point is
 837  * valid, in affine coordinates, and not the point at infinity.
 838  */
 839 static void
 840 p256_encode(void *dst, const p256_jacobian *P)
 841 {
 842         unsigned char *buf;
 843
 844         buf = dst;
 845         buf[0] = 0x04;
 846         le13_to_be8(buf + 1, 32, P->x);
 847         le13_to_be8(buf + 33, 32, P->y);
 848 }
 849
 850 /*
 851  * Multiply a curve point by an integer. The integer is assumed to be
 852  * lower than the curve order, and the base point must not be the point
 853  * at infinity.
 854  */
 855 static void
 856 p256_mul(p256_jacobian *P, const unsigned char *x, size_t xlen)
 857 {
 858         /*
 859          * qz is a flag that is initially 1, and remains equal to 1
 860          * as long as the point is the point at infinity.
 861          *
 862          * We use a 2-bit window to handle multiplier bits by pairs.
 863          * The precomputed window really is the points P2 and P3.
 864          */
 865         uint32_t qz;
 866         p256_jacobian P2, P3, Q, T, U;
 867
 868         /*
 869          * Compute window values.
 870          */
 871         P2 = *P;
 872         p256_double(&P2);
 873         P3 = *P;
 874         p256_add(&P3, &P2);
 875
 876         /*
 877          * We start with Q = 0. We process multiplier bits 2 by 2.
 878          */
 879         memset(&Q, 0, sizeof Q);
 880         qz = 1;
 881         while (xlen -- > 0) {
 882                 int k;
 883
 884                 for (k = 6; k >= 0; k -= 2) {
 885                         uint32_t bits;
 886                         uint32_t bnz;
 887
 888                         p256_double(&Q);
 889                         p256_double(&Q);
 890                         T = *P;
 891                         U = Q;
 892                         bits = (*x >> k) & (uint32_t)3;
 893                         bnz = NEQ(bits, 0);
 894                         CCOPY(EQ(bits, 2), &T, &P2, sizeof T);
 895                         CCOPY(EQ(bits, 3), &T, &P3, sizeof T);
 896                         p256_add(&U, &T);
 897                         CCOPY(bnz & qz, &Q, &T, sizeof Q);
 898                         CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
 899                         qz &= ~bnz;
 900                 }
 901                 x ++;
 902         }
 903         *P = Q;
 904 }
 905
 906 static const unsigned char P256_G[] = {
 907         0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8,
 908         0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D,
 909         0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8,
 910         0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
 911         0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B,
 912         0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40,
 913         0x68, 0x37, 0xBF, 0x51, 0xF5
 914 };
 915
 916 static const unsigned char P256_N[] = {
 917         0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
 918         0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD,
 919         0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63,
 920         0x25, 0x51
 921 };
 922
 923 static const unsigned char *
 924 api_generator(int curve, size_t *len)
 925 {
 926         (void)curve;
 927         *len = sizeof P256_G;
 928         return P256_G;
 929 }
 930
 931 static const unsigned char *
 932 api_order(int curve, size_t *len)
 933 {
 934         (void)curve;
 935         *len = sizeof P256_N;
 936         return P256_N;
 937 }
 938
 939 static uint32_t
 940 api_mul(unsigned char *G, size_t Glen,
 941         const unsigned char *x, size_t xlen, int curve)
 942 {
 943         uint32_t r;
 944         p256_jacobian P;
 945
 946         (void)curve;
 947         r = p256_decode(&P, G, Glen);
 948         p256_mul(&P, x, xlen);
 949         if (Glen >= 65) {
 950                 p256_to_affine(&P);
 951                 p256_encode(G, &P);
 952         }
 953         return r;
 954 }
 955
 956 static uint32_t
 957 api_muladd(unsigned char *A, const unsigned char *B, size_t len,
 958         const unsigned char *x, size_t xlen,
 959         const unsigned char *y, size_t ylen, int curve)
 960 {
 961         p256_jacobian P, Q;
 962         uint32_t r, t, z;
 963         int i;
 964
 965         (void)curve;
 966         r = p256_decode(&P, A, len);
 967         r &= p256_decode(&Q, B, len);
 968         p256_mul(&P, x, xlen);
 969         p256_mul(&Q, y, ylen);
 970
 971         /*
 972          * The final addition may fail in case both points are equal.
 973          */
 974         t = p256_add(&P, &Q);
 975         reduce_final_f256(P.z);
 976         z = 0;
 977         for (i = 0; i < 20; i ++) {
 978                 z |= P.z[i];
 979         }
 980         z = EQ(z, 0);
 981         p256_double(&Q);
 982
 983         /*
 984          * If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
 985          * have the following:
 986          *
 987          *   z = 0, t = 0   return P (normal addition)
 988          *   z = 0, t = 1   return P (normal addition)
 989          *   z = 1, t = 0   return Q (a 'double' case)
 990          *   z = 1, t = 1   report an error (P+Q = 0)
 991          */
 992         CCOPY(z & ~t, &P, &Q, sizeof Q);
 993         p256_to_affine(&P);
 994         p256_encode(A, &P);
 995         r &= ~(z & t);
 996         return r;
 997 }
 998
 999 /* see bearssl_ec.h */
1000 const br_ec_impl br_ec_p256_i15 = {
1001         (uint32_t)0x00800000,
1002         &api_generator,
1003         &api_order,
1004         &api_mul,
1005         &api_muladd
1006 };