_ Git - BearSSL/blob - src/ec/ec_p256_i15.c

   1 /*
   2  * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining
   5  * a copy of this software and associated documentation files (the
   6  * "Software"), to deal in the Software without restriction, including
   7  * without limitation the rights to use, copy, modify, merge, publish,
   8  * distribute, sublicense, and/or sell copies of the Software, and to
   9  * permit persons to whom the Software is furnished to do so, subject to
  10  * the following conditions:
  11  *
  12  * The above copyright notice and this permission notice shall be
  13  * included in all copies or substantial portions of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  16  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  17  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  18  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  19  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  20  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  21  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24
  25 #include "inner.h"
  26
  27 /*
  28  * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
  29  * that right-shifting a signed negative integer copies the sign bit
  30  * (arithmetic right-shift). This is "implementation-defined behaviour",
  31  * i.e. it is not undefined, but it may differ between compilers. Each
  32  * compiler is supposed to document its behaviour in that respect. GCC
  33  * explicitly defines that an arithmetic right shift is used. We expect
  34  * all other compilers to do the same, because underlying CPU offer an
  35  * arithmetic right shift opcode that could not be used otherwise.
  36  */
  37 #if BR_NO_ARITH_SHIFT
  38 #define ARSH(x, n)   (((uint32_t)(x) >> (n)) \
  39                     | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
  40 #else
  41 #define ARSH(x, n)   ((*(int32_t *)&(x)) >> (n))
  42 #endif
  43
  44 /*
  45  * Convert an integer from unsigned big-endian encoding to a sequence of
  46  * 13-bit words in little-endian order. The final "partial" word is
  47  * returned.
  48  */
  49 static uint32_t
  50 be8_to_le13(uint32_t *dst, const unsigned char *src, size_t len)
  51 {
  52         uint32_t acc;
  53         int acc_len;
  54
  55         acc = 0;
  56         acc_len = 0;
  57         while (len -- > 0) {
  58                 acc |= (uint32_t)src[len] << acc_len;
  59                 acc_len += 8;
  60                 if (acc_len >= 13) {
  61                         *dst ++ = acc & 0x1FFF;
  62                         acc >>= 13;
  63                         acc_len -= 13;
  64                 }
  65         }
  66         return acc;
  67 }
  68
  69 /*
  70  * Convert an integer (13-bit words, little-endian) to unsigned
  71  * big-endian encoding. The total encoding length is provided; all
  72  * the destination bytes will be filled.
  73  */
  74 static void
  75 le13_to_be8(unsigned char *dst, size_t len, const uint32_t *src)
  76 {
  77         uint32_t acc;
  78         int acc_len;
  79
  80         acc = 0;
  81         acc_len = 0;
  82         while (len -- > 0) {
  83                 if (acc_len < 8) {
  84                         acc |= (*src ++) << acc_len;
  85                         acc_len += 13;
  86                 }
  87                 dst[len] = (unsigned char)acc;
  88                 acc >>= 8;
  89                 acc_len -= 8;
  90         }
  91 }
  92
  93 /*
  94  * Normalise an array of words to a strict 13 bits per word. Returned
  95  * value is the resulting carry. The source (w) and destination (d)
  96  * arrays may be identical, but shall not overlap partially.
  97  */
  98 static inline uint32_t
  99 norm13(uint32_t *d, const uint32_t *w, size_t len)
 100 {
 101         size_t u;
 102         uint32_t cc;
 103
 104         cc = 0;
 105         for (u = 0; u < len; u ++) {
 106                 int32_t z;
 107
 108                 z = w[u] + cc;
 109                 d[u] = z & 0x1FFF;
 110                 cc = ARSH(z, 13);
 111         }
 112         return cc;
 113 }
 114
 115 /*
 116  * mul20() multiplies two 260-bit integers together. Each word must fit
 117  * on 13 bits; source operands use 20 words, destination operand
 118  * receives 40 words. All overlaps allowed.
 119  *
 120  *
 121  */
 122
 123 #if BR_SLOW_MUL15
 124
 125 static void
 126 mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
 127 {
 128         /*
 129          * Two-level Karatsuba: turns a 20x20 multiplication into
 130          * nine 5x5 multiplications. We use 13-bit words but do not
 131          * propagate carries immediately, so words may expand:
 132          *
 133          *  - First Karatsuba decomposition turns the 20x20 mul on
 134          *    13-bit words into three 10x10 muls, two on 13-bit words
 135          *    and one on 14-bit words.
 136          *
 137          *  - Second Karatsuba decomposition further splits these into:
 138          *
 139          *     * four 5x5 muls on 13-bit words
 140          *     * four 5x5 muls on 14-bit words
 141          *     * one 5x5 mul on 15-bit words
 142          *
 143          * Highest word value is 8191, 16382 or 32764, for 13-bit, 14-bit
 144          * or 15-bit words, respectively.
 145          */
 146         uint32_t u[45], v[45], w[90];
 147         uint32_t cc;
 148         int i;
 149
 150 #define ZADD(dw, d_off, s1w, s1_off, s2w, s2_off)   do { \
 151                 (dw)[5 * (d_off) + 0] = (s1w)[5 * (s1_off) + 0] \
 152                         + (s2w)[5 * (s2_off) + 0]; \
 153                 (dw)[5 * (d_off) + 1] = (s1w)[5 * (s1_off) + 1] \
 154                         + (s2w)[5 * (s2_off) + 1]; \
 155                 (dw)[5 * (d_off) + 2] = (s1w)[5 * (s1_off) + 2] \
 156                         + (s2w)[5 * (s2_off) + 2]; \
 157                 (dw)[5 * (d_off) + 3] = (s1w)[5 * (s1_off) + 3] \
 158                         + (s2w)[5 * (s2_off) + 3]; \
 159                 (dw)[5 * (d_off) + 4] = (s1w)[5 * (s1_off) + 4] \
 160                         + (s2w)[5 * (s2_off) + 4]; \
 161         } while (0)
 162
 163 #define ZADDT(dw, d_off, sw, s_off)   do { \
 164                 (dw)[5 * (d_off) + 0] += (sw)[5 * (s_off) + 0]; \
 165                 (dw)[5 * (d_off) + 1] += (sw)[5 * (s_off) + 1]; \
 166                 (dw)[5 * (d_off) + 2] += (sw)[5 * (s_off) + 2]; \
 167                 (dw)[5 * (d_off) + 3] += (sw)[5 * (s_off) + 3]; \
 168                 (dw)[5 * (d_off) + 4] += (sw)[5 * (s_off) + 4]; \
 169         } while (0)
 170
 171 #define ZSUB2F(dw, d_off, s1w, s1_off, s2w, s2_off)   do { \
 172                 (dw)[5 * (d_off) + 0] -= (s1w)[5 * (s1_off) + 0] \
 173                         + (s2w)[5 * (s2_off) + 0]; \
 174                 (dw)[5 * (d_off) + 1] -= (s1w)[5 * (s1_off) + 1] \
 175                         + (s2w)[5 * (s2_off) + 1]; \
 176                 (dw)[5 * (d_off) + 2] -= (s1w)[5 * (s1_off) + 2] \
 177                         + (s2w)[5 * (s2_off) + 2]; \
 178                 (dw)[5 * (d_off) + 3] -= (s1w)[5 * (s1_off) + 3] \
 179                         + (s2w)[5 * (s2_off) + 3]; \
 180                 (dw)[5 * (d_off) + 4] -= (s1w)[5 * (s1_off) + 4] \
 181                         + (s2w)[5 * (s2_off) + 4]; \
 182         } while (0)
 183
 184 #define CPR1(w, cprcc)   do { \
 185                 uint32_t cprz = (w) + cprcc; \
 186                 (w) = cprz & 0x1FFF; \
 187                 cprcc = cprz >> 13; \
 188         } while (0)
 189
 190 #define CPR(dw, d_off)   do { \
 191                 uint32_t cprcc; \
 192                 cprcc = 0; \
 193                 CPR1((dw)[(d_off) + 0], cprcc); \
 194                 CPR1((dw)[(d_off) + 1], cprcc); \
 195                 CPR1((dw)[(d_off) + 2], cprcc); \
 196                 CPR1((dw)[(d_off) + 3], cprcc); \
 197                 CPR1((dw)[(d_off) + 4], cprcc); \
 198                 CPR1((dw)[(d_off) + 5], cprcc); \
 199                 CPR1((dw)[(d_off) + 6], cprcc); \
 200                 CPR1((dw)[(d_off) + 7], cprcc); \
 201                 CPR1((dw)[(d_off) + 8], cprcc); \
 202                 (dw)[(d_off) + 9] = cprcc; \
 203         } while (0)
 204
 205         memcpy(u, a, 20 * sizeof *a);
 206         ZADD(u, 4, a, 0, a, 1);
 207         ZADD(u, 5, a, 2, a, 3);
 208         ZADD(u, 6, a, 0, a, 2);
 209         ZADD(u, 7, a, 1, a, 3);
 210         ZADD(u, 8, u, 6, u, 7);
 211
 212         memcpy(v, b, 20 * sizeof *b);
 213         ZADD(v, 4, b, 0, b, 1);
 214         ZADD(v, 5, b, 2, b, 3);
 215         ZADD(v, 6, b, 0, b, 2);
 216         ZADD(v, 7, b, 1, b, 3);
 217         ZADD(v, 8, v, 6, v, 7);
 218
 219         /*
 220          * Do the eight first 8x8 muls. Source words are at most 16382
 221          * each, so we can add product results together "as is" in 32-bit
 222          * words.
 223          */
 224         for (i = 0; i < 40; i += 5) {
 225                 w[(i << 1) + 0] = MUL15(u[i + 0], v[i + 0]);
 226                 w[(i << 1) + 1] = MUL15(u[i + 0], v[i + 1])
 227                         + MUL15(u[i + 1], v[i + 0]);
 228                 w[(i << 1) + 2] = MUL15(u[i + 0], v[i + 2])
 229                         + MUL15(u[i + 1], v[i + 1])
 230                         + MUL15(u[i + 2], v[i + 0]);
 231                 w[(i << 1) + 3] = MUL15(u[i + 0], v[i + 3])
 232                         + MUL15(u[i + 1], v[i + 2])
 233                         + MUL15(u[i + 2], v[i + 1])
 234                         + MUL15(u[i + 3], v[i + 0]);
 235                 w[(i << 1) + 4] = MUL15(u[i + 0], v[i + 4])
 236                         + MUL15(u[i + 1], v[i + 3])
 237                         + MUL15(u[i + 2], v[i + 2])
 238                         + MUL15(u[i + 3], v[i + 1])
 239                         + MUL15(u[i + 4], v[i + 0]);
 240                 w[(i << 1) + 5] = MUL15(u[i + 1], v[i + 4])
 241                         + MUL15(u[i + 2], v[i + 3])
 242                         + MUL15(u[i + 3], v[i + 2])
 243                         + MUL15(u[i + 4], v[i + 1]);
 244                 w[(i << 1) + 6] = MUL15(u[i + 2], v[i + 4])
 245                         + MUL15(u[i + 3], v[i + 3])
 246                         + MUL15(u[i + 4], v[i + 2]);
 247                 w[(i << 1) + 7] = MUL15(u[i + 3], v[i + 4])
 248                         + MUL15(u[i + 4], v[i + 3]);
 249                 w[(i << 1) + 8] = MUL15(u[i + 4], v[i + 4]);
 250                 w[(i << 1) + 9] = 0;
 251         }
 252
 253         /*
 254          * For the 9th multiplication, source words are up to 32764,
 255          * so we must do some carry propagation. If we add up to
 256          * 4 products and the carry is no more than 524224, then the
 257          * result fits in 32 bits, and the next carry will be no more
 258          * than 524224 (because 4*(32764^2)+524224 < 8192*524225).
 259          *
 260          * We thus just skip one of the products in the middle word,
 261          * then do a carry propagation (this reduces words to 13 bits
 262          * each, except possibly the last, which may use up to 17 bits
 263          * or so), then add the missing product.
 264          */
 265         w[80 + 0] = MUL15(u[40 + 0], v[40 + 0]);
 266         w[80 + 1] = MUL15(u[40 + 0], v[40 + 1])
 267                 + MUL15(u[40 + 1], v[40 + 0]);
 268         w[80 + 2] = MUL15(u[40 + 0], v[40 + 2])
 269                 + MUL15(u[40 + 1], v[40 + 1])
 270                 + MUL15(u[40 + 2], v[40 + 0]);
 271         w[80 + 3] = MUL15(u[40 + 0], v[40 + 3])
 272                 + MUL15(u[40 + 1], v[40 + 2])
 273                 + MUL15(u[40 + 2], v[40 + 1])
 274                 + MUL15(u[40 + 3], v[40 + 0]);
 275         w[80 + 4] = MUL15(u[40 + 0], v[40 + 4])
 276                 + MUL15(u[40 + 1], v[40 + 3])
 277                 + MUL15(u[40 + 2], v[40 + 2])
 278                 + MUL15(u[40 + 3], v[40 + 1]);
 279                 /* + MUL15(u[40 + 4], v[40 + 0]) */
 280         w[80 + 5] = MUL15(u[40 + 1], v[40 + 4])
 281                 + MUL15(u[40 + 2], v[40 + 3])
 282                 + MUL15(u[40 + 3], v[40 + 2])
 283                 + MUL15(u[40 + 4], v[40 + 1]);
 284         w[80 + 6] = MUL15(u[40 + 2], v[40 + 4])
 285                 + MUL15(u[40 + 3], v[40 + 3])
 286                 + MUL15(u[40 + 4], v[40 + 2]);
 287         w[80 + 7] = MUL15(u[40 + 3], v[40 + 4])
 288                 + MUL15(u[40 + 4], v[40 + 3]);
 289         w[80 + 8] = MUL15(u[40 + 4], v[40 + 4]);
 290
 291         CPR(w, 80);
 292
 293         w[80 + 4] += MUL15(u[40 + 4], v[40 + 0]);
 294
 295         /*
 296          * The products on 14-bit words in slots 6 and 7 yield values
 297          * up to 5*(16382^2) each, and we need to subtract two such
 298          * values from the higher word. We need the subtraction to fit
 299          * in a _signed_ 32-bit integer, i.e. 31 bits + a sign bit.
 300          * However, 10*(16382^2) does not fit. So we must perform a
 301          * bit of reduction here.
 302          */
 303         CPR(w, 60);
 304         CPR(w, 70);
 305
 306         /*
 307          * Recompose results.
 308          */
 309
 310         /* 0..1*0..1 into 0..3 */
 311         ZSUB2F(w, 8, w, 0, w, 2);
 312         ZSUB2F(w, 9, w, 1, w, 3);
 313         ZADDT(w, 1, w, 8);
 314         ZADDT(w, 2, w, 9);
 315
 316         /* 2..3*2..3 into 4..7 */
 317         ZSUB2F(w, 10, w, 4, w, 6);
 318         ZSUB2F(w, 11, w, 5, w, 7);
 319         ZADDT(w, 5, w, 10);
 320         ZADDT(w, 6, w, 11);
 321
 322         /* (0..1+2..3)*(0..1+2..3) into 12..15 */
 323         ZSUB2F(w, 16, w, 12, w, 14);
 324         ZSUB2F(w, 17, w, 13, w, 15);
 325         ZADDT(w, 13, w, 16);
 326         ZADDT(w, 14, w, 17);
 327
 328         /* first-level recomposition */
 329         ZSUB2F(w, 12, w, 0, w, 4);
 330         ZSUB2F(w, 13, w, 1, w, 5);
 331         ZSUB2F(w, 14, w, 2, w, 6);
 332         ZSUB2F(w, 15, w, 3, w, 7);
 333         ZADDT(w, 2, w, 12);
 334         ZADDT(w, 3, w, 13);
 335         ZADDT(w, 4, w, 14);
 336         ZADDT(w, 5, w, 15);
 337
 338         /*
 339          * Perform carry propagation to bring all words down to 13 bits.
 340          */
 341         cc = norm13(d, w, 40);
 342         d[39] += (cc << 13);
 343
 344 #undef ZADD
 345 #undef ZADDT
 346 #undef ZSUB2F
 347 #undef CPR1
 348 #undef CPR
 349 }
 350
 351 #else
 352
 353 static void
 354 mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
 355 {
 356         uint32_t t[39];
 357
 358         t[ 0] = MUL15(a[ 0], b[ 0]);
 359         t[ 1] = MUL15(a[ 0], b[ 1])
 360                 + MUL15(a[ 1], b[ 0]);
 361         t[ 2] = MUL15(a[ 0], b[ 2])
 362                 + MUL15(a[ 1], b[ 1])
 363                 + MUL15(a[ 2], b[ 0]);
 364         t[ 3] = MUL15(a[ 0], b[ 3])
 365                 + MUL15(a[ 1], b[ 2])
 366                 + MUL15(a[ 2], b[ 1])
 367                 + MUL15(a[ 3], b[ 0]);
 368         t[ 4] = MUL15(a[ 0], b[ 4])
 369                 + MUL15(a[ 1], b[ 3])
 370                 + MUL15(a[ 2], b[ 2])
 371                 + MUL15(a[ 3], b[ 1])
 372                 + MUL15(a[ 4], b[ 0]);
 373         t[ 5] = MUL15(a[ 0], b[ 5])
 374                 + MUL15(a[ 1], b[ 4])
 375                 + MUL15(a[ 2], b[ 3])
 376                 + MUL15(a[ 3], b[ 2])
 377                 + MUL15(a[ 4], b[ 1])
 378                 + MUL15(a[ 5], b[ 0]);
 379         t[ 6] = MUL15(a[ 0], b[ 6])
 380                 + MUL15(a[ 1], b[ 5])
 381                 + MUL15(a[ 2], b[ 4])
 382                 + MUL15(a[ 3], b[ 3])
 383                 + MUL15(a[ 4], b[ 2])
 384                 + MUL15(a[ 5], b[ 1])
 385                 + MUL15(a[ 6], b[ 0]);
 386         t[ 7] = MUL15(a[ 0], b[ 7])
 387                 + MUL15(a[ 1], b[ 6])
 388                 + MUL15(a[ 2], b[ 5])
 389                 + MUL15(a[ 3], b[ 4])
 390                 + MUL15(a[ 4], b[ 3])
 391                 + MUL15(a[ 5], b[ 2])
 392                 + MUL15(a[ 6], b[ 1])
 393                 + MUL15(a[ 7], b[ 0]);
 394         t[ 8] = MUL15(a[ 0], b[ 8])
 395                 + MUL15(a[ 1], b[ 7])
 396                 + MUL15(a[ 2], b[ 6])
 397                 + MUL15(a[ 3], b[ 5])
 398                 + MUL15(a[ 4], b[ 4])
 399                 + MUL15(a[ 5], b[ 3])
 400                 + MUL15(a[ 6], b[ 2])
 401                 + MUL15(a[ 7], b[ 1])
 402                 + MUL15(a[ 8], b[ 0]);
 403         t[ 9] = MUL15(a[ 0], b[ 9])
 404                 + MUL15(a[ 1], b[ 8])
 405                 + MUL15(a[ 2], b[ 7])
 406                 + MUL15(a[ 3], b[ 6])
 407                 + MUL15(a[ 4], b[ 5])
 408                 + MUL15(a[ 5], b[ 4])
 409                 + MUL15(a[ 6], b[ 3])
 410                 + MUL15(a[ 7], b[ 2])
 411                 + MUL15(a[ 8], b[ 1])
 412                 + MUL15(a[ 9], b[ 0]);
 413         t[10] = MUL15(a[ 0], b[10])
 414                 + MUL15(a[ 1], b[ 9])
 415                 + MUL15(a[ 2], b[ 8])
 416                 + MUL15(a[ 3], b[ 7])
 417                 + MUL15(a[ 4], b[ 6])
 418                 + MUL15(a[ 5], b[ 5])
 419                 + MUL15(a[ 6], b[ 4])
 420                 + MUL15(a[ 7], b[ 3])
 421                 + MUL15(a[ 8], b[ 2])
 422                 + MUL15(a[ 9], b[ 1])
 423                 + MUL15(a[10], b[ 0]);
 424         t[11] = MUL15(a[ 0], b[11])
 425                 + MUL15(a[ 1], b[10])
 426                 + MUL15(a[ 2], b[ 9])
 427                 + MUL15(a[ 3], b[ 8])
 428                 + MUL15(a[ 4], b[ 7])
 429                 + MUL15(a[ 5], b[ 6])
 430                 + MUL15(a[ 6], b[ 5])
 431                 + MUL15(a[ 7], b[ 4])
 432                 + MUL15(a[ 8], b[ 3])
 433                 + MUL15(a[ 9], b[ 2])
 434                 + MUL15(a[10], b[ 1])
 435                 + MUL15(a[11], b[ 0]);
 436         t[12] = MUL15(a[ 0], b[12])
 437                 + MUL15(a[ 1], b[11])
 438                 + MUL15(a[ 2], b[10])
 439                 + MUL15(a[ 3], b[ 9])
 440                 + MUL15(a[ 4], b[ 8])
 441                 + MUL15(a[ 5], b[ 7])
 442                 + MUL15(a[ 6], b[ 6])
 443                 + MUL15(a[ 7], b[ 5])
 444                 + MUL15(a[ 8], b[ 4])
 445                 + MUL15(a[ 9], b[ 3])
 446                 + MUL15(a[10], b[ 2])
 447                 + MUL15(a[11], b[ 1])
 448                 + MUL15(a[12], b[ 0]);
 449         t[13] = MUL15(a[ 0], b[13])
 450                 + MUL15(a[ 1], b[12])
 451                 + MUL15(a[ 2], b[11])
 452                 + MUL15(a[ 3], b[10])
 453                 + MUL15(a[ 4], b[ 9])
 454                 + MUL15(a[ 5], b[ 8])
 455                 + MUL15(a[ 6], b[ 7])
 456                 + MUL15(a[ 7], b[ 6])
 457                 + MUL15(a[ 8], b[ 5])
 458                 + MUL15(a[ 9], b[ 4])
 459                 + MUL15(a[10], b[ 3])
 460                 + MUL15(a[11], b[ 2])
 461                 + MUL15(a[12], b[ 1])
 462                 + MUL15(a[13], b[ 0]);
 463         t[14] = MUL15(a[ 0], b[14])
 464                 + MUL15(a[ 1], b[13])
 465                 + MUL15(a[ 2], b[12])
 466                 + MUL15(a[ 3], b[11])
 467                 + MUL15(a[ 4], b[10])
 468                 + MUL15(a[ 5], b[ 9])
 469                 + MUL15(a[ 6], b[ 8])
 470                 + MUL15(a[ 7], b[ 7])
 471                 + MUL15(a[ 8], b[ 6])
 472                 + MUL15(a[ 9], b[ 5])
 473                 + MUL15(a[10], b[ 4])
 474                 + MUL15(a[11], b[ 3])
 475                 + MUL15(a[12], b[ 2])
 476                 + MUL15(a[13], b[ 1])
 477                 + MUL15(a[14], b[ 0]);
 478         t[15] = MUL15(a[ 0], b[15])
 479                 + MUL15(a[ 1], b[14])
 480                 + MUL15(a[ 2], b[13])
 481                 + MUL15(a[ 3], b[12])
 482                 + MUL15(a[ 4], b[11])
 483                 + MUL15(a[ 5], b[10])
 484                 + MUL15(a[ 6], b[ 9])
 485                 + MUL15(a[ 7], b[ 8])
 486                 + MUL15(a[ 8], b[ 7])
 487                 + MUL15(a[ 9], b[ 6])
 488                 + MUL15(a[10], b[ 5])
 489                 + MUL15(a[11], b[ 4])
 490                 + MUL15(a[12], b[ 3])
 491                 + MUL15(a[13], b[ 2])
 492                 + MUL15(a[14], b[ 1])
 493                 + MUL15(a[15], b[ 0]);
 494         t[16] = MUL15(a[ 0], b[16])
 495                 + MUL15(a[ 1], b[15])
 496                 + MUL15(a[ 2], b[14])
 497                 + MUL15(a[ 3], b[13])
 498                 + MUL15(a[ 4], b[12])
 499                 + MUL15(a[ 5], b[11])
 500                 + MUL15(a[ 6], b[10])
 501                 + MUL15(a[ 7], b[ 9])
 502                 + MUL15(a[ 8], b[ 8])
 503                 + MUL15(a[ 9], b[ 7])
 504                 + MUL15(a[10], b[ 6])
 505                 + MUL15(a[11], b[ 5])
 506                 + MUL15(a[12], b[ 4])
 507                 + MUL15(a[13], b[ 3])
 508                 + MUL15(a[14], b[ 2])
 509                 + MUL15(a[15], b[ 1])
 510                 + MUL15(a[16], b[ 0]);
 511         t[17] = MUL15(a[ 0], b[17])
 512                 + MUL15(a[ 1], b[16])
 513                 + MUL15(a[ 2], b[15])
 514                 + MUL15(a[ 3], b[14])
 515                 + MUL15(a[ 4], b[13])
 516                 + MUL15(a[ 5], b[12])
 517                 + MUL15(a[ 6], b[11])
 518                 + MUL15(a[ 7], b[10])
 519                 + MUL15(a[ 8], b[ 9])
 520                 + MUL15(a[ 9], b[ 8])
 521                 + MUL15(a[10], b[ 7])
 522                 + MUL15(a[11], b[ 6])
 523                 + MUL15(a[12], b[ 5])
 524                 + MUL15(a[13], b[ 4])
 525                 + MUL15(a[14], b[ 3])
 526                 + MUL15(a[15], b[ 2])
 527                 + MUL15(a[16], b[ 1])
 528                 + MUL15(a[17], b[ 0]);
 529         t[18] = MUL15(a[ 0], b[18])
 530                 + MUL15(a[ 1], b[17])
 531                 + MUL15(a[ 2], b[16])
 532                 + MUL15(a[ 3], b[15])
 533                 + MUL15(a[ 4], b[14])
 534                 + MUL15(a[ 5], b[13])
 535                 + MUL15(a[ 6], b[12])
 536                 + MUL15(a[ 7], b[11])
 537                 + MUL15(a[ 8], b[10])
 538                 + MUL15(a[ 9], b[ 9])
 539                 + MUL15(a[10], b[ 8])
 540                 + MUL15(a[11], b[ 7])
 541                 + MUL15(a[12], b[ 6])
 542                 + MUL15(a[13], b[ 5])
 543                 + MUL15(a[14], b[ 4])
 544                 + MUL15(a[15], b[ 3])
 545                 + MUL15(a[16], b[ 2])
 546                 + MUL15(a[17], b[ 1])
 547                 + MUL15(a[18], b[ 0]);
 548         t[19] = MUL15(a[ 0], b[19])
 549                 + MUL15(a[ 1], b[18])
 550                 + MUL15(a[ 2], b[17])
 551                 + MUL15(a[ 3], b[16])
 552                 + MUL15(a[ 4], b[15])
 553                 + MUL15(a[ 5], b[14])
 554                 + MUL15(a[ 6], b[13])
 555                 + MUL15(a[ 7], b[12])
 556                 + MUL15(a[ 8], b[11])
 557                 + MUL15(a[ 9], b[10])
 558                 + MUL15(a[10], b[ 9])
 559                 + MUL15(a[11], b[ 8])
 560                 + MUL15(a[12], b[ 7])
 561                 + MUL15(a[13], b[ 6])
 562                 + MUL15(a[14], b[ 5])
 563                 + MUL15(a[15], b[ 4])
 564                 + MUL15(a[16], b[ 3])
 565                 + MUL15(a[17], b[ 2])
 566                 + MUL15(a[18], b[ 1])
 567                 + MUL15(a[19], b[ 0]);
 568         t[20] = MUL15(a[ 1], b[19])
 569                 + MUL15(a[ 2], b[18])
 570                 + MUL15(a[ 3], b[17])
 571                 + MUL15(a[ 4], b[16])
 572                 + MUL15(a[ 5], b[15])
 573                 + MUL15(a[ 6], b[14])
 574                 + MUL15(a[ 7], b[13])
 575                 + MUL15(a[ 8], b[12])
 576                 + MUL15(a[ 9], b[11])
 577                 + MUL15(a[10], b[10])
 578                 + MUL15(a[11], b[ 9])
 579                 + MUL15(a[12], b[ 8])
 580                 + MUL15(a[13], b[ 7])
 581                 + MUL15(a[14], b[ 6])
 582                 + MUL15(a[15], b[ 5])
 583                 + MUL15(a[16], b[ 4])
 584                 + MUL15(a[17], b[ 3])
 585                 + MUL15(a[18], b[ 2])
 586                 + MUL15(a[19], b[ 1]);
 587         t[21] = MUL15(a[ 2], b[19])
 588                 + MUL15(a[ 3], b[18])
 589                 + MUL15(a[ 4], b[17])
 590                 + MUL15(a[ 5], b[16])
 591                 + MUL15(a[ 6], b[15])
 592                 + MUL15(a[ 7], b[14])
 593                 + MUL15(a[ 8], b[13])
 594                 + MUL15(a[ 9], b[12])
 595                 + MUL15(a[10], b[11])
 596                 + MUL15(a[11], b[10])
 597                 + MUL15(a[12], b[ 9])
 598                 + MUL15(a[13], b[ 8])
 599                 + MUL15(a[14], b[ 7])
 600                 + MUL15(a[15], b[ 6])
 601                 + MUL15(a[16], b[ 5])
 602                 + MUL15(a[17], b[ 4])
 603                 + MUL15(a[18], b[ 3])
 604                 + MUL15(a[19], b[ 2]);
 605         t[22] = MUL15(a[ 3], b[19])
 606                 + MUL15(a[ 4], b[18])
 607                 + MUL15(a[ 5], b[17])
 608                 + MUL15(a[ 6], b[16])
 609                 + MUL15(a[ 7], b[15])
 610                 + MUL15(a[ 8], b[14])
 611                 + MUL15(a[ 9], b[13])
 612                 + MUL15(a[10], b[12])
 613                 + MUL15(a[11], b[11])
 614                 + MUL15(a[12], b[10])
 615                 + MUL15(a[13], b[ 9])
 616                 + MUL15(a[14], b[ 8])
 617                 + MUL15(a[15], b[ 7])
 618                 + MUL15(a[16], b[ 6])
 619                 + MUL15(a[17], b[ 5])
 620                 + MUL15(a[18], b[ 4])
 621                 + MUL15(a[19], b[ 3]);
 622         t[23] = MUL15(a[ 4], b[19])
 623                 + MUL15(a[ 5], b[18])
 624                 + MUL15(a[ 6], b[17])
 625                 + MUL15(a[ 7], b[16])
 626                 + MUL15(a[ 8], b[15])
 627                 + MUL15(a[ 9], b[14])
 628                 + MUL15(a[10], b[13])
 629                 + MUL15(a[11], b[12])
 630                 + MUL15(a[12], b[11])
 631                 + MUL15(a[13], b[10])
 632                 + MUL15(a[14], b[ 9])
 633                 + MUL15(a[15], b[ 8])
 634                 + MUL15(a[16], b[ 7])
 635                 + MUL15(a[17], b[ 6])
 636                 + MUL15(a[18], b[ 5])
 637                 + MUL15(a[19], b[ 4]);
 638         t[24] = MUL15(a[ 5], b[19])
 639                 + MUL15(a[ 6], b[18])
 640                 + MUL15(a[ 7], b[17])
 641                 + MUL15(a[ 8], b[16])
 642                 + MUL15(a[ 9], b[15])
 643                 + MUL15(a[10], b[14])
 644                 + MUL15(a[11], b[13])
 645                 + MUL15(a[12], b[12])
 646                 + MUL15(a[13], b[11])
 647                 + MUL15(a[14], b[10])
 648                 + MUL15(a[15], b[ 9])
 649                 + MUL15(a[16], b[ 8])
 650                 + MUL15(a[17], b[ 7])
 651                 + MUL15(a[18], b[ 6])
 652                 + MUL15(a[19], b[ 5]);
 653         t[25] = MUL15(a[ 6], b[19])
 654                 + MUL15(a[ 7], b[18])
 655                 + MUL15(a[ 8], b[17])
 656                 + MUL15(a[ 9], b[16])
 657                 + MUL15(a[10], b[15])
 658                 + MUL15(a[11], b[14])
 659                 + MUL15(a[12], b[13])
 660                 + MUL15(a[13], b[12])
 661                 + MUL15(a[14], b[11])
 662                 + MUL15(a[15], b[10])
 663                 + MUL15(a[16], b[ 9])
 664                 + MUL15(a[17], b[ 8])
 665                 + MUL15(a[18], b[ 7])
 666                 + MUL15(a[19], b[ 6]);
 667         t[26] = MUL15(a[ 7], b[19])
 668                 + MUL15(a[ 8], b[18])
 669                 + MUL15(a[ 9], b[17])
 670                 + MUL15(a[10], b[16])
 671                 + MUL15(a[11], b[15])
 672                 + MUL15(a[12], b[14])
 673                 + MUL15(a[13], b[13])
 674                 + MUL15(a[14], b[12])
 675                 + MUL15(a[15], b[11])
 676                 + MUL15(a[16], b[10])
 677                 + MUL15(a[17], b[ 9])
 678                 + MUL15(a[18], b[ 8])
 679                 + MUL15(a[19], b[ 7]);
 680         t[27] = MUL15(a[ 8], b[19])
 681                 + MUL15(a[ 9], b[18])
 682                 + MUL15(a[10], b[17])
 683                 + MUL15(a[11], b[16])
 684                 + MUL15(a[12], b[15])
 685                 + MUL15(a[13], b[14])
 686                 + MUL15(a[14], b[13])
 687                 + MUL15(a[15], b[12])
 688                 + MUL15(a[16], b[11])
 689                 + MUL15(a[17], b[10])
 690                 + MUL15(a[18], b[ 9])
 691                 + MUL15(a[19], b[ 8]);
 692         t[28] = MUL15(a[ 9], b[19])
 693                 + MUL15(a[10], b[18])
 694                 + MUL15(a[11], b[17])
 695                 + MUL15(a[12], b[16])
 696                 + MUL15(a[13], b[15])
 697                 + MUL15(a[14], b[14])
 698                 + MUL15(a[15], b[13])
 699                 + MUL15(a[16], b[12])
 700                 + MUL15(a[17], b[11])
 701                 + MUL15(a[18], b[10])
 702                 + MUL15(a[19], b[ 9]);
 703         t[29] = MUL15(a[10], b[19])
 704                 + MUL15(a[11], b[18])
 705                 + MUL15(a[12], b[17])
 706                 + MUL15(a[13], b[16])
 707                 + MUL15(a[14], b[15])
 708                 + MUL15(a[15], b[14])
 709                 + MUL15(a[16], b[13])
 710                 + MUL15(a[17], b[12])
 711                 + MUL15(a[18], b[11])
 712                 + MUL15(a[19], b[10]);
 713         t[30] = MUL15(a[11], b[19])
 714                 + MUL15(a[12], b[18])
 715                 + MUL15(a[13], b[17])
 716                 + MUL15(a[14], b[16])
 717                 + MUL15(a[15], b[15])
 718                 + MUL15(a[16], b[14])
 719                 + MUL15(a[17], b[13])
 720                 + MUL15(a[18], b[12])
 721                 + MUL15(a[19], b[11]);
 722         t[31] = MUL15(a[12], b[19])
 723                 + MUL15(a[13], b[18])
 724                 + MUL15(a[14], b[17])
 725                 + MUL15(a[15], b[16])
 726                 + MUL15(a[16], b[15])
 727                 + MUL15(a[17], b[14])
 728                 + MUL15(a[18], b[13])
 729                 + MUL15(a[19], b[12]);
 730         t[32] = MUL15(a[13], b[19])
 731                 + MUL15(a[14], b[18])
 732                 + MUL15(a[15], b[17])
 733                 + MUL15(a[16], b[16])
 734                 + MUL15(a[17], b[15])
 735                 + MUL15(a[18], b[14])
 736                 + MUL15(a[19], b[13]);
 737         t[33] = MUL15(a[14], b[19])
 738                 + MUL15(a[15], b[18])
 739                 + MUL15(a[16], b[17])
 740                 + MUL15(a[17], b[16])
 741                 + MUL15(a[18], b[15])
 742                 + MUL15(a[19], b[14]);
 743         t[34] = MUL15(a[15], b[19])
 744                 + MUL15(a[16], b[18])
 745                 + MUL15(a[17], b[17])
 746                 + MUL15(a[18], b[16])
 747                 + MUL15(a[19], b[15]);
 748         t[35] = MUL15(a[16], b[19])
 749                 + MUL15(a[17], b[18])
 750                 + MUL15(a[18], b[17])
 751                 + MUL15(a[19], b[16]);
 752         t[36] = MUL15(a[17], b[19])
 753                 + MUL15(a[18], b[18])
 754                 + MUL15(a[19], b[17]);
 755         t[37] = MUL15(a[18], b[19])
 756                 + MUL15(a[19], b[18]);
 757         t[38] = MUL15(a[19], b[19]);
 758         d[39] = norm13(d, t, 39);
 759 }
 760
 761 #endif
 762
 763 /*
 764  * Modulus for field F256 (field for point coordinates in curve P-256).
 765  */
 766 static const uint32_t F256[] = {
 767         0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x001F,
 768         0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0400, 0x0000,
 769         0x0000, 0x1FF8, 0x1FFF, 0x01FF
 770 };
 771
 772 /*
 773  * The 'b' curve equation coefficient for P-256.
 774  */
 775 static const uint32_t P256_B[] = {
 776         0x004B, 0x1E93, 0x0F89, 0x1C78, 0x03BC, 0x187B, 0x114E, 0x1619,
 777         0x1D06, 0x0328, 0x01AF, 0x0D31, 0x1557, 0x15DE, 0x1ECF, 0x127C,
 778         0x0A3A, 0x0EC5, 0x118D, 0x00B5
 779 };
 780
 781 /*
 782  * Perform a "short reduction" in field F256 (field for curve P-256).
 783  * The source value should be less than 262 bits; on output, it will
 784  * be at most 257 bits, and less than twice the modulus.
 785  */
 786 static void
 787 reduce_f256(uint32_t *d)
 788 {
 789         uint32_t x;
 790
 791         x = d[19] >> 9;
 792         d[19] &= 0x01FF;
 793         d[17] += x << 3;
 794         d[14] -= x << 10;
 795         d[7] -= x << 5;
 796         d[0] += x;
 797         norm13(d, d, 20);
 798 }
 799
 800 /*
 801  * Perform a "final reduction" in field F256 (field for curve P-256).
 802  * The source value must be less than twice the modulus. If the value
 803  * is not lower than the modulus, then the modulus is subtracted and
 804  * this function returns 1; otherwise, it leaves it untouched and it
 805  * returns 0.
 806  */
 807 static uint32_t
 808 reduce_final_f256(uint32_t *d)
 809 {
 810         uint32_t t[20];
 811         uint32_t cc;
 812         int i;
 813
 814         memcpy(t, d, sizeof t);
 815         cc = 0;
 816         for (i = 0; i < 20; i ++) {
 817                 uint32_t w;
 818
 819                 w = t[i] - F256[i] - cc;
 820                 cc = w >> 31;
 821                 t[i] = w & 0x1FFF;
 822         }
 823         cc ^= 1;
 824         CCOPY(cc, d, t, sizeof t);
 825         return cc;
 826 }
 827
 828 /*
 829  * Perform a multiplication of two integers modulo
 830  * 2^256-2^224+2^192+2^96-1 (for NIST curve P-256). Operands are arrays
 831  * of 20 words, each containing 13 bits of data, in little-endian order.
 832  * On input, upper word may be up to 15 bits (hence value up to 2^262-1);
 833  * on output, value fits on 257 bits and is lower than twice the modulus.
 834  */
 835 static void
 836 mul_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
 837 {
 838         uint32_t t[40], cc;
 839         int i;
 840
 841         /*
 842          * Compute raw multiplication. All result words fit in 13 bits
 843          * each.
 844          */
 845         mul20(t, a, b);
 846
 847         /*
 848          * Modular reduction: each high word in added/subtracted where
 849          * necessary.
 850          *
 851          * The modulus is:
 852          *    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
 853          * Therefore:
 854          *    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
 855          *
 856          * For a word x at bit offset n (n >= 256), we have:
 857          *    x*2^n = x*2^(n-32) - x*2^(n-64)
 858          *            - x*2^(n - 160) + x*2^(n-256) mod p
 859          *
 860          * Thus, we can nullify the high word if we reinject it at some
 861          * proper emplacements.
 862          */
 863         for (i = 39; i >= 20; i --) {
 864                 uint32_t x;
 865
 866                 x = t[i];
 867                 t[i - 2] += ARSH(x, 6);
 868                 t[i - 3] += (x << 7) & 0x1FFF;
 869                 t[i - 4] -= ARSH(x, 12);
 870                 t[i - 5] -= (x << 1) & 0x1FFF;
 871                 t[i - 12] -= ARSH(x, 4);
 872                 t[i - 13] -= (x << 9) & 0x1FFF;
 873                 t[i - 19] += ARSH(x, 9);
 874                 t[i - 20] += (x << 4) & 0x1FFF;
 875         }
 876
 877         /*
 878          * Propagate carries. Since the operation above really is a
 879          * truncature, followed by the addition of nonnegative values,
 880          * the result will be positive. Moreover, the carry cannot
 881          * exceed 5 bits (we performed 20 additions with values smaller
 882          * than 256 bits).
 883          */
 884         cc = norm13(t, t, 20);
 885
 886         /*
 887          * Perform modular reduction again for the bits beyond 256 (the carry
 888          * and the bits 256..259). This time, we can simply inject full
 889          * word values.
 890          */
 891         cc = (cc << 4) | (t[19] >> 9);
 892         t[19] &= 0x01FF;
 893         t[17] += cc << 3;
 894         t[14] -= cc << 10;
 895         t[7] -= cc << 5;
 896         t[0] += cc;
 897         norm13(d, t, 20);
 898 }
 899
 900 /*
 901  * Jacobian coordinates for a point in P-256: affine coordinates (X,Y)
 902  * are such that:
 903  *   X = x / z^2
 904  *   Y = y / z^3
 905  * For the point at infinity, z = 0.
 906  * Each point thus admits many possible representations.
 907  *
 908  * Coordinates are represented in arrays of 32-bit integers, each holding
 909  * 13 bits of data. Values may also be slightly greater than the modulus,
 910  * but they will always be lower than twice the modulus.
 911  */
 912 typedef struct {
 913         uint32_t x[20];
 914         uint32_t y[20];
 915         uint32_t z[20];
 916 } p256_jacobian;
 917
 918 /*
 919  * Convert a point to affine coordinates:
 920  *  - If the point is the point at infinity, then all three coordinates
 921  *    are set to 0.
 922  *  - Otherwise, the 'z' coordinate is set to 1, and the 'x' and 'y'
 923  *    coordinates are the 'X' and 'Y' affine coordinates.
 924  * The coordinates are guaranteed to be lower than the modulus.
 925  */
 926 static void
 927 p256_to_affine(p256_jacobian *P)
 928 {
 929         uint32_t t1[20], t2[20];
 930         int i;
 931
 932         /*
 933          * Invert z with a modular exponentiation: the modulus is
 934          * p = 2^256 - 2^224 + 2^192 + 2^96 - 1, and the exponent is
 935          * p-2. Exponent bit pattern (from high to low) is:
 936          *  - 32 bits of value 1
 937          *  - 31 bits of value 0
 938          *  - 1 bit of value 1
 939          *  - 96 bits of value 0
 940          *  - 94 bits of value 1
 941          *  - 1 bit of value 0
 942          *  - 1 bit of value 1
 943          * Thus, we precompute z^(2^31-1) to speed things up.
 944          *
 945          * If z = 0 (point at infinity) then the modular exponentiation
 946          * will yield 0, which leads to the expected result (all three
 947          * coordinates set to 0).
 948          */
 949
 950         /*
 951          * A simple square-and-multiply for z^(2^31-1). We could save about
 952          * two dozen multiplications here with an addition chain, but
 953          * this would require a bit more code, and extra stack buffers.
 954          */
 955         memcpy(t1, P->z, sizeof P->z);
 956         for (i = 0; i < 30; i ++) {
 957                 mul_f256(t1, t1, t1);
 958                 mul_f256(t1, t1, P->z);
 959         }
 960
 961         /*
 962          * Square-and-multiply. Apart from the squarings, we have a few
 963          * multiplications to set bits to 1; we multiply by the original z
 964          * for setting 1 bit, and by t1 for setting 31 bits.
 965          */
 966         memcpy(t2, P->z, sizeof P->z);
 967         for (i = 1; i < 256; i ++) {
 968                 mul_f256(t2, t2, t2);
 969                 switch (i) {
 970                 case 31:
 971                 case 190:
 972                 case 221:
 973                 case 252:
 974                         mul_f256(t2, t2, t1);
 975                         break;
 976                 case 63:
 977                 case 253:
 978                 case 255:
 979                         mul_f256(t2, t2, P->z);
 980                         break;
 981                 }
 982         }
 983
 984         /*
 985          * Now that we have 1/z, multiply x by 1/z^2 and y by 1/z^3.
 986          */
 987         mul_f256(t1, t2, t2);
 988         mul_f256(P->x, t1, P->x);
 989         mul_f256(t1, t1, t2);
 990         mul_f256(P->y, t1, P->y);
 991         reduce_final_f256(P->x);
 992         reduce_final_f256(P->y);
 993
 994         /*
 995          * Multiply z by 1/z. If z = 0, then this will yield 0, otherwise
 996          * this will set z to 1.
 997          */
 998         mul_f256(P->z, P->z, t2);
 999         reduce_final_f256(P->z);
1000 }
1001
1002 /*
1003  * Double a point in P-256. This function works for all valid points,
1004  * including the point at infinity.
1005  */
1006 static void
1007 p256_double(p256_jacobian *Q)
1008 {
1009         /*
1010          * Doubling formulas are:
1011          *
1012          *   s = 4*x*y^2
1013          *   m = 3*(x + z^2)*(x - z^2)
1014          *   x' = m^2 - 2*s
1015          *   y' = m*(s - x') - 8*y^4
1016          *   z' = 2*y*z
1017          *
1018          * These formulas work for all points, including points of order 2
1019          * and points at infinity:
1020          *   - If y = 0 then z' = 0. But there is no such point in P-256
1021          *     anyway.
1022          *   - If z = 0 then z' = 0.
1023          */
1024         uint32_t t1[20], t2[20], t3[20], t4[20];
1025         int i;
1026
1027         /*
1028          * Compute z^2 in t1.
1029          */
1030         mul_f256(t1, Q->z, Q->z);
1031
1032         /*
1033          * Compute x-z^2 in t2 and x+z^2 in t1.
1034          */
1035         for (i = 0; i < 20; i ++) {
1036                 t2[i] = (F256[i] << 1) + Q->x[i] - t1[i];
1037                 t1[i] += Q->x[i];
1038         }
1039         norm13(t1, t1, 20);
1040         norm13(t2, t2, 20);
1041
1042         /*
1043          * Compute 3*(x+z^2)*(x-z^2) in t1.
1044          */
1045         mul_f256(t3, t1, t2);
1046         for (i = 0; i < 20; i ++) {
1047                 t1[i] = MUL15(3, t3[i]);
1048         }
1049         norm13(t1, t1, 20);
1050
1051         /*
1052          * Compute 4*x*y^2 (in t2) and 2*y^2 (in t3).
1053          */
1054         mul_f256(t3, Q->y, Q->y);
1055         for (i = 0; i < 20; i ++) {
1056                 t3[i] <<= 1;
1057         }
1058         norm13(t3, t3, 20);
1059         mul_f256(t2, Q->x, t3);
1060         for (i = 0; i < 20; i ++) {
1061                 t2[i] <<= 1;
1062         }
1063         norm13(t2, t2, 20);
1064         reduce_f256(t2);
1065
1066         /*
1067          * Compute x' = m^2 - 2*s.
1068          */
1069         mul_f256(Q->x, t1, t1);
1070         for (i = 0; i < 20; i ++) {
1071                 Q->x[i] += (F256[i] << 2) - (t2[i] << 1);
1072         }
1073         norm13(Q->x, Q->x, 20);
1074         reduce_f256(Q->x);
1075
1076         /*
1077          * Compute z' = 2*y*z.
1078          */
1079         mul_f256(t4, Q->y, Q->z);
1080         for (i = 0; i < 20; i ++) {
1081                 Q->z[i] = t4[i] << 1;
1082         }
1083         norm13(Q->z, Q->z, 20);
1084         reduce_f256(Q->z);
1085
1086         /*
1087          * Compute y' = m*(s - x') - 8*y^4. Note that we already have
1088          * 2*y^2 in t3.
1089          */
1090         for (i = 0; i < 20; i ++) {
1091                 t2[i] += (F256[i] << 1) - Q->x[i];
1092         }
1093         norm13(t2, t2, 20);
1094         mul_f256(Q->y, t1, t2);
1095         mul_f256(t4, t3, t3);
1096         for (i = 0; i < 20; i ++) {
1097                 Q->y[i] += (F256[i] << 2) - (t4[i] << 1);
1098         }
1099         norm13(Q->y, Q->y, 20);
1100         reduce_f256(Q->y);
1101 }
1102
1103 /*
1104  * Add point P2 to point P1.
1105  *
1106  * This function computes the wrong result in the following cases:
1107  *
1108  *   - If P1 == 0 but P2 != 0
1109  *   - If P1 != 0 but P2 == 0
1110  *   - If P1 == P2
1111  *
1112  * In all three cases, P1 is set to the point at infinity.
1113  *
1114  * Returned value is 0 if one of the following occurs:
1115  *
1116  *   - P1 and P2 have the same Y coordinate
1117  *   - P1 == 0 and P2 == 0
1118  *   - The Y coordinate of one of the points is 0 and the other point is
1119  *     the point at infinity.
1120  *
1121  * The third case cannot actually happen with valid points, since a point
1122  * with Y == 0 is a point of order 2, and there is no point of order 2 on
1123  * curve P-256.
1124  *
1125  * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller
1126  * can apply the following:
1127  *
1128  *   - If the result is not the point at infinity, then it is correct.
1129  *   - Otherwise, if the returned value is 1, then this is a case of
1130  *     P1+P2 == 0, so the result is indeed the point at infinity.
1131  *   - Otherwise, P1 == P2, so a "double" operation should have been
1132  *     performed.
1133  */
1134 static uint32_t
1135 p256_add(p256_jacobian *P1, const p256_jacobian *P2)
1136 {
1137         /*
1138          * Addtions formulas are:
1139          *
1140          *   u1 = x1 * z2^2
1141          *   u2 = x2 * z1^2
1142          *   s1 = y1 * z2^3
1143          *   s2 = y2 * z1^3
1144          *   h = u2 - u1
1145          *   r = s2 - s1
1146          *   x3 = r^2 - h^3 - 2 * u1 * h^2
1147          *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
1148          *   z3 = h * z1 * z2
1149          */
1150         uint32_t t1[20], t2[20], t3[20], t4[20], t5[20], t6[20], t7[20];
1151         uint32_t ret;
1152         int i;
1153
1154         /*
1155          * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
1156          */
1157         mul_f256(t3, P2->z, P2->z);
1158         mul_f256(t1, P1->x, t3);
1159         mul_f256(t4, P2->z, t3);
1160         mul_f256(t3, P1->y, t4);
1161
1162         /*
1163          * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
1164          */
1165         mul_f256(t4, P1->z, P1->z);
1166         mul_f256(t2, P2->x, t4);
1167         mul_f256(t5, P1->z, t4);
1168         mul_f256(t4, P2->y, t5);
1169
1170         /*
1171          * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
1172          * We need to test whether r is zero, so we will do some extra
1173          * reduce.
1174          */
1175         for (i = 0; i < 20; i ++) {
1176                 t2[i] += (F256[i] << 1) - t1[i];
1177                 t4[i] += (F256[i] << 1) - t3[i];
1178         }
1179         norm13(t2, t2, 20);
1180         norm13(t4, t4, 20);
1181         reduce_f256(t4);
1182         reduce_final_f256(t4);
1183         ret = 0;
1184         for (i = 0; i < 20; i ++) {
1185                 ret |= t4[i];
1186         }
1187         ret = (ret | -ret) >> 31;
1188
1189         /*
1190          * Compute u1*h^2 (in t6) and h^3 (in t5);
1191          */
1192         mul_f256(t7, t2, t2);
1193         mul_f256(t6, t1, t7);
1194         mul_f256(t5, t7, t2);
1195
1196         /*
1197          * Compute x3 = r^2 - h^3 - 2*u1*h^2.
1198          */
1199         mul_f256(P1->x, t4, t4);
1200         for (i = 0; i < 20; i ++) {
1201                 P1->x[i] += (F256[i] << 3) - t5[i] - (t6[i] << 1);
1202         }
1203         norm13(P1->x, P1->x, 20);
1204         reduce_f256(P1->x);
1205
1206         /*
1207          * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
1208          */
1209         for (i = 0; i < 20; i ++) {
1210                 t6[i] += (F256[i] << 1) - P1->x[i];
1211         }
1212         norm13(t6, t6, 20);
1213         mul_f256(P1->y, t4, t6);
1214         mul_f256(t1, t5, t3);
1215         for (i = 0; i < 20; i ++) {
1216                 P1->y[i] += (F256[i] << 1) - t1[i];
1217         }
1218         norm13(P1->y, P1->y, 20);
1219         reduce_f256(P1->y);
1220
1221         /*
1222          * Compute z3 = h*z1*z2.
1223          */
1224         mul_f256(t1, P1->z, P2->z);
1225         mul_f256(P1->z, t1, t2);
1226
1227         return ret;
1228 }
1229
1230 /*
1231  * Decode a P-256 point. This function does not support the point at
1232  * infinity. Returned value is 0 if the point is invalid, 1 otherwise.
1233  */
1234 static uint32_t
1235 p256_decode(p256_jacobian *P, const void *src, size_t len)
1236 {
1237         const unsigned char *buf;
1238         uint32_t tx[20], ty[20], t1[20], t2[20];
1239         uint32_t bad;
1240         int i;
1241
1242         if (len != 65) {
1243                 return 0;
1244         }
1245         buf = src;
1246
1247         /*
1248          * First byte must be 0x04 (uncompressed format). We could support
1249          * "hybrid format" (first byte is 0x06 or 0x07, and encodes the
1250          * least significant bit of the Y coordinate), but it is explicitly
1251          * forbidden by RFC 5480 (section 2.2).
1252          */
1253         bad = NEQ(buf[0], 0x04);
1254
1255         /*
1256          * Decode the coordinates, and check that they are both lower
1257          * than the modulus.
1258          */
1259         tx[19] = be8_to_le13(tx, buf + 1, 32);
1260         ty[19] = be8_to_le13(ty, buf + 33, 32);
1261         bad |= reduce_final_f256(tx);
1262         bad |= reduce_final_f256(ty);
1263
1264         /*
1265          * Check curve equation.
1266          */
1267         mul_f256(t1, tx, tx);
1268         mul_f256(t1, tx, t1);
1269         mul_f256(t2, ty, ty);
1270         for (i = 0; i < 20; i ++) {
1271                 t1[i] += (F256[i] << 3) - MUL15(3, tx[i]) + P256_B[i] - t2[i];
1272         }
1273         norm13(t1, t1, 20);
1274         reduce_f256(t1);
1275         reduce_final_f256(t1);
1276         for (i = 0; i < 20; i ++) {
1277                 bad |= t1[i];
1278         }
1279
1280         /*
1281          * Copy coordinates to the point structure.
1282          */
1283         memcpy(P->x, tx, sizeof tx);
1284         memcpy(P->y, ty, sizeof ty);
1285         memset(P->z, 0, sizeof P->z);
1286         P->z[0] = 1;
1287         return NEQ(bad, 0) ^ 1;
1288 }
1289
1290 /*
1291  * Encode a point into a buffer. This function assumes that the point is
1292  * valid, in affine coordinates, and not the point at infinity.
1293  */
1294 static void
1295 p256_encode(void *dst, const p256_jacobian *P)
1296 {
1297         unsigned char *buf;
1298
1299         buf = dst;
1300         buf[0] = 0x04;
1301         le13_to_be8(buf + 1, 32, P->x);
1302         le13_to_be8(buf + 33, 32, P->y);
1303 }
1304
1305 /*
1306  * Multiply a curve point by an integer. The integer is assumed to be
1307  * lower than the curve order, and the base point must not be the point
1308  * at infinity.
1309  */
1310 static void
1311 p256_mul(p256_jacobian *P, const unsigned char *x, size_t xlen)
1312 {
1313         /*
1314          * qz is a flag that is initially 1, and remains equal to 1
1315          * as long as the point is the point at infinity.
1316          *
1317          * We use a 2-bit window to handle multiplier bits by pairs.
1318          * The precomputed window really is the points P2 and P3.
1319          */
1320         uint32_t qz;
1321         p256_jacobian P2, P3, Q, T, U;
1322
1323         /*
1324          * Compute window values.
1325          */
1326         P2 = *P;
1327         p256_double(&P2);
1328         P3 = *P;
1329         p256_add(&P3, &P2);
1330
1331         /*
1332          * We start with Q = 0. We process multiplier bits 2 by 2.
1333          */
1334         memset(&Q, 0, sizeof Q);
1335         qz = 1;
1336         while (xlen -- > 0) {
1337                 int k;
1338
1339                 for (k = 6; k >= 0; k -= 2) {
1340                         uint32_t bits;
1341                         uint32_t bnz;
1342
1343                         p256_double(&Q);
1344                         p256_double(&Q);
1345                         T = *P;
1346                         U = Q;
1347                         bits = (*x >> k) & (uint32_t)3;
1348                         bnz = NEQ(bits, 0);
1349                         CCOPY(EQ(bits, 2), &T, &P2, sizeof T);
1350                         CCOPY(EQ(bits, 3), &T, &P3, sizeof T);
1351                         p256_add(&U, &T);
1352                         CCOPY(bnz & qz, &Q, &T, sizeof Q);
1353                         CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
1354                         qz &= ~bnz;
1355                 }
1356                 x ++;
1357         }
1358         *P = Q;
1359 }
1360
1361 static const unsigned char P256_G[] = {
1362         0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8,
1363         0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D,
1364         0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8,
1365         0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
1366         0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B,
1367         0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40,
1368         0x68, 0x37, 0xBF, 0x51, 0xF5
1369 };
1370
1371 static const unsigned char P256_N[] = {
1372         0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
1373         0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD,
1374         0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63,
1375         0x25, 0x51
1376 };
1377
1378 static const unsigned char *
1379 api_generator(int curve, size_t *len)
1380 {
1381         (void)curve;
1382         *len = sizeof P256_G;
1383         return P256_G;
1384 }
1385
1386 static const unsigned char *
1387 api_order(int curve, size_t *len)
1388 {
1389         (void)curve;
1390         *len = sizeof P256_N;
1391         return P256_N;
1392 }
1393
1394 static uint32_t
1395 api_mul(unsigned char *G, size_t Glen,
1396         const unsigned char *x, size_t xlen, int curve)
1397 {
1398         uint32_t r;
1399         p256_jacobian P;
1400
1401         (void)curve;
1402         r = p256_decode(&P, G, Glen);
1403         p256_mul(&P, x, xlen);
1404         if (Glen >= 65) {
1405                 p256_to_affine(&P);
1406                 p256_encode(G, &P);
1407         }
1408         return r;
1409 }
1410
1411 static uint32_t
1412 api_muladd(unsigned char *A, const unsigned char *B, size_t len,
1413         const unsigned char *x, size_t xlen,
1414         const unsigned char *y, size_t ylen, int curve)
1415 {
1416         p256_jacobian P, Q;
1417         uint32_t r, t, z;
1418         int i;
1419
1420         (void)curve;
1421         r = p256_decode(&P, A, len);
1422         r &= p256_decode(&Q, B, len);
1423         p256_mul(&P, x, xlen);
1424         p256_mul(&Q, y, ylen);
1425
1426         /*
1427          * The final addition may fail in case both points are equal.
1428          */
1429         t = p256_add(&P, &Q);
1430         reduce_final_f256(P.z);
1431         z = 0;
1432         for (i = 0; i < 20; i ++) {
1433                 z |= P.z[i];
1434         }
1435         z = EQ(z, 0);
1436         p256_double(&Q);
1437
1438         /*
1439          * If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
1440          * have the following:
1441          *
1442          *   z = 0, t = 0   return P (normal addition)
1443          *   z = 0, t = 1   return P (normal addition)
1444          *   z = 1, t = 0   return Q (a 'double' case)
1445          *   z = 1, t = 1   report an error (P+Q = 0)
1446          */
1447         CCOPY(z & ~t, &P, &Q, sizeof Q);
1448         p256_to_affine(&P);
1449         p256_encode(A, &P);
1450         r &= ~(z & t);
1451         return r;
1452 }
1453
1454 /* see bearssl_ec.h */
1455 const br_ec_impl br_ec_p256_i15 = {
1456         (uint32_t)0x00800000,
1457         &api_generator,
1458         &api_order,
1459         &api_mul,
1460         &api_muladd
1461 };