_ Git - BearSSL/blob - src/ec/ec_c25519_m31.c

   1 /*
   2  * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining
   5  * a copy of this software and associated documentation files (the
   6  * "Software"), to deal in the Software without restriction, including
   7  * without limitation the rights to use, copy, modify, merge, publish,
   8  * distribute, sublicense, and/or sell copies of the Software, and to
   9  * permit persons to whom the Software is furnished to do so, subject to
  10  * the following conditions:
  11  *
  12  * The above copyright notice and this permission notice shall be
  13  * included in all copies or substantial portions of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  16  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  17  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  18  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  19  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  20  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  21  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24
  25 #include "inner.h"
  26
  27 /* obsolete
  28 #include <stdio.h>
  29 #include <stdlib.h>
  30 static void
  31 print_int(const char *name, const uint32_t *x)
  32 {
  33         size_t u;
  34         unsigned char tmp[40];
  35
  36         printf("%s = ", name);
  37         for (u = 0; u < 9; u ++) {
  38                 if (x[u] > 0x3FFFFFFF) {
  39                         printf("INVALID:");
  40                         for (u = 0; u < 9; u ++) {
  41                                 printf(" %08X", x[u]);
  42                         }
  43                         printf("\n");
  44                         return;
  45                 }
  46         }
  47         memset(tmp, 0, sizeof tmp);
  48         for (u = 0; u < 9; u ++) {
  49                 uint64_t w;
  50                 int j, k;
  51
  52                 w = x[u];
  53                 j = 30 * (int)u;
  54                 k = j & 7;
  55                 if (k != 0) {
  56                         w <<= k;
  57                         j -= k;
  58                 }
  59                 k = j >> 3;
  60                 for (j = 0; j < 8; j ++) {
  61                         tmp[39 - k - j] |= (unsigned char)w;
  62                         w >>= 8;
  63                 }
  64         }
  65         for (u = 8; u < 40; u ++) {
  66                 printf("%02X", tmp[u]);
  67         }
  68         printf("\n");
  69 }
  70 */
  71
  72 /*
  73  * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
  74  * that right-shifting a signed negative integer copies the sign bit
  75  * (arithmetic right-shift). This is "implementation-defined behaviour",
  76  * i.e. it is not undefined, but it may differ between compilers. Each
  77  * compiler is supposed to document its behaviour in that respect. GCC
  78  * explicitly defines that an arithmetic right shift is used. We expect
  79  * all other compilers to do the same, because underlying CPU offer an
  80  * arithmetic right shift opcode that could not be used otherwise.
  81  */
  82 #if BR_NO_ARITH_SHIFT
  83 #define ARSH(x, n)   (((uint32_t)(x) >> (n)) \
  84                     | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
  85 #else
  86 #define ARSH(x, n)   ((*(int32_t *)&(x)) >> (n))
  87 #endif
  88
  89 /*
  90  * Convert an integer from unsigned little-endian encoding to a sequence of
  91  * 30-bit words in little-endian order. The final "partial" word is
  92  * returned.
  93  */
  94 static uint32_t
  95 le8_to_le30(uint32_t *dst, const unsigned char *src, size_t len)
  96 {
  97         uint32_t acc;
  98         int acc_len;
  99
 100         acc = 0;
 101         acc_len = 0;
 102         while (len -- > 0) {
 103                 uint32_t b;
 104
 105                 b = *src ++;
 106                 if (acc_len < 22) {
 107                         acc |= b << acc_len;
 108                         acc_len += 8;
 109                 } else {
 110                         *dst ++ = (acc | (b << acc_len)) & 0x3FFFFFFF;
 111                         acc = b >> (30 - acc_len);
 112                         acc_len -= 22;
 113                 }
 114         }
 115         return acc;
 116 }
 117
 118 /*
 119  * Convert an integer (30-bit words, little-endian) to unsigned
 120  * little-endian encoding. The total encoding length is provided; all
 121  * the destination bytes will be filled.
 122  */
 123 static void
 124 le30_to_le8(unsigned char *dst, size_t len, const uint32_t *src)
 125 {
 126         uint32_t acc;
 127         int acc_len;
 128
 129         acc = 0;
 130         acc_len = 0;
 131         while (len -- > 0) {
 132                 if (acc_len < 8) {
 133                         uint32_t w;
 134
 135                         w = *src ++;
 136                         *dst ++ = (unsigned char)(acc | (w << acc_len));
 137                         acc = w >> (8 - acc_len);
 138                         acc_len += 22;
 139                 } else {
 140                         *dst ++ = (unsigned char)acc;
 141                         acc >>= 8;
 142                         acc_len -= 8;
 143                 }
 144         }
 145 }
 146
 147 /*
 148  * Multiply two integers. Source integers are represented as arrays of
 149  * nine 30-bit words, for values up to 2^270-1. Result is encoded over
 150  * 18 words of 30 bits each.
 151  */
 152 static void
 153 mul9(uint32_t *d, const uint32_t *a, const uint32_t *b)
 154 {
 155         /*
 156          * Maximum intermediate result is no more than
 157          * 10376293531797946367, which fits in 64 bits. Reason:
 158          *
 159          *   10376293531797946367 = 9 * (2^30-1)^2 + 9663676406
 160          *   10376293531797946367 < 9663676407 * 2^30
 161          *
 162          * Thus, adding together 9 products of 30-bit integers, with
 163          * a carry of at most 9663676406, yields an integer that fits
 164          * on 64 bits and generates a carry of at most 9663676406.
 165          */
 166         uint64_t t[17];
 167         uint64_t cc;
 168         int i;
 169
 170         t[ 0] = MUL31(a[0], b[0]);
 171         t[ 1] = MUL31(a[0], b[1])
 172                 + MUL31(a[1], b[0]);
 173         t[ 2] = MUL31(a[0], b[2])
 174                 + MUL31(a[1], b[1])
 175                 + MUL31(a[2], b[0]);
 176         t[ 3] = MUL31(a[0], b[3])
 177                 + MUL31(a[1], b[2])
 178                 + MUL31(a[2], b[1])
 179                 + MUL31(a[3], b[0]);
 180         t[ 4] = MUL31(a[0], b[4])
 181                 + MUL31(a[1], b[3])
 182                 + MUL31(a[2], b[2])
 183                 + MUL31(a[3], b[1])
 184                 + MUL31(a[4], b[0]);
 185         t[ 5] = MUL31(a[0], b[5])
 186                 + MUL31(a[1], b[4])
 187                 + MUL31(a[2], b[3])
 188                 + MUL31(a[3], b[2])
 189                 + MUL31(a[4], b[1])
 190                 + MUL31(a[5], b[0]);
 191         t[ 6] = MUL31(a[0], b[6])
 192                 + MUL31(a[1], b[5])
 193                 + MUL31(a[2], b[4])
 194                 + MUL31(a[3], b[3])
 195                 + MUL31(a[4], b[2])
 196                 + MUL31(a[5], b[1])
 197                 + MUL31(a[6], b[0]);
 198         t[ 7] = MUL31(a[0], b[7])
 199                 + MUL31(a[1], b[6])
 200                 + MUL31(a[2], b[5])
 201                 + MUL31(a[3], b[4])
 202                 + MUL31(a[4], b[3])
 203                 + MUL31(a[5], b[2])
 204                 + MUL31(a[6], b[1])
 205                 + MUL31(a[7], b[0]);
 206         t[ 8] = MUL31(a[0], b[8])
 207                 + MUL31(a[1], b[7])
 208                 + MUL31(a[2], b[6])
 209                 + MUL31(a[3], b[5])
 210                 + MUL31(a[4], b[4])
 211                 + MUL31(a[5], b[3])
 212                 + MUL31(a[6], b[2])
 213                 + MUL31(a[7], b[1])
 214                 + MUL31(a[8], b[0]);
 215         t[ 9] = MUL31(a[1], b[8])
 216                 + MUL31(a[2], b[7])
 217                 + MUL31(a[3], b[6])
 218                 + MUL31(a[4], b[5])
 219                 + MUL31(a[5], b[4])
 220                 + MUL31(a[6], b[3])
 221                 + MUL31(a[7], b[2])
 222                 + MUL31(a[8], b[1]);
 223         t[10] = MUL31(a[2], b[8])
 224                 + MUL31(a[3], b[7])
 225                 + MUL31(a[4], b[6])
 226                 + MUL31(a[5], b[5])
 227                 + MUL31(a[6], b[4])
 228                 + MUL31(a[7], b[3])
 229                 + MUL31(a[8], b[2]);
 230         t[11] = MUL31(a[3], b[8])
 231                 + MUL31(a[4], b[7])
 232                 + MUL31(a[5], b[6])
 233                 + MUL31(a[6], b[5])
 234                 + MUL31(a[7], b[4])
 235                 + MUL31(a[8], b[3]);
 236         t[12] = MUL31(a[4], b[8])
 237                 + MUL31(a[5], b[7])
 238                 + MUL31(a[6], b[6])
 239                 + MUL31(a[7], b[5])
 240                 + MUL31(a[8], b[4]);
 241         t[13] = MUL31(a[5], b[8])
 242                 + MUL31(a[6], b[7])
 243                 + MUL31(a[7], b[6])
 244                 + MUL31(a[8], b[5]);
 245         t[14] = MUL31(a[6], b[8])
 246                 + MUL31(a[7], b[7])
 247                 + MUL31(a[8], b[6]);
 248         t[15] = MUL31(a[7], b[8])
 249                 + MUL31(a[8], b[7]);
 250         t[16] = MUL31(a[8], b[8]);
 251
 252         /*
 253          * Propagate carries.
 254          */
 255         cc = 0;
 256         for (i = 0; i < 17; i ++) {
 257                 uint64_t w;
 258
 259                 w = t[i] + cc;
 260                 d[i] = (uint32_t)w & 0x3FFFFFFF;
 261                 cc = w >> 30;
 262         }
 263         d[17] = (uint32_t)cc;
 264 }
 265
 266 /*
 267  * Square a 270-bit integer, represented as an array of nine 30-bit words.
 268  * Result uses 18 words of 30 bits each.
 269  */
 270 static void
 271 square9(uint32_t *d, const uint32_t *a)
 272 {
 273         uint64_t t[17];
 274         uint64_t cc;
 275         int i;
 276
 277         t[ 0] = MUL31(a[0], a[0]);
 278         t[ 1] = ((MUL31(a[0], a[1])) << 1);
 279         t[ 2] = MUL31(a[1], a[1])
 280                 + ((MUL31(a[0], a[2])) << 1);
 281         t[ 3] = ((MUL31(a[0], a[3])
 282                 + MUL31(a[1], a[2])) << 1);
 283         t[ 4] = MUL31(a[2], a[2])
 284                 + ((MUL31(a[0], a[4])
 285                 + MUL31(a[1], a[3])) << 1);
 286         t[ 5] = ((MUL31(a[0], a[5])
 287                 + MUL31(a[1], a[4])
 288                 + MUL31(a[2], a[3])) << 1);
 289         t[ 6] = MUL31(a[3], a[3])
 290                 + ((MUL31(a[0], a[6])
 291                 + MUL31(a[1], a[5])
 292                 + MUL31(a[2], a[4])) << 1);
 293         t[ 7] = ((MUL31(a[0], a[7])
 294                 + MUL31(a[1], a[6])
 295                 + MUL31(a[2], a[5])
 296                 + MUL31(a[3], a[4])) << 1);
 297         t[ 8] = MUL31(a[4], a[4])
 298                 + ((MUL31(a[0], a[8])
 299                 + MUL31(a[1], a[7])
 300                 + MUL31(a[2], a[6])
 301                 + MUL31(a[3], a[5])) << 1);
 302         t[ 9] = ((MUL31(a[1], a[8])
 303                 + MUL31(a[2], a[7])
 304                 + MUL31(a[3], a[6])
 305                 + MUL31(a[4], a[5])) << 1);
 306         t[10] = MUL31(a[5], a[5])
 307                 + ((MUL31(a[2], a[8])
 308                 + MUL31(a[3], a[7])
 309                 + MUL31(a[4], a[6])) << 1);
 310         t[11] = ((MUL31(a[3], a[8])
 311                 + MUL31(a[4], a[7])
 312                 + MUL31(a[5], a[6])) << 1);
 313         t[12] = MUL31(a[6], a[6])
 314                 + ((MUL31(a[4], a[8])
 315                 + MUL31(a[5], a[7])) << 1);
 316         t[13] = ((MUL31(a[5], a[8])
 317                 + MUL31(a[6], a[7])) << 1);
 318         t[14] = MUL31(a[7], a[7])
 319                 + ((MUL31(a[6], a[8])) << 1);
 320         t[15] = ((MUL31(a[7], a[8])) << 1);
 321         t[16] = MUL31(a[8], a[8]);
 322
 323         /*
 324          * Propagate carries.
 325          */
 326         cc = 0;
 327         for (i = 0; i < 17; i ++) {
 328                 uint64_t w;
 329
 330                 w = t[i] + cc;
 331                 d[i] = (uint32_t)w & 0x3FFFFFFF;
 332                 cc = w >> 30;
 333         }
 334         d[17] = (uint32_t)cc;
 335 }
 336
 337 /*
 338  * Perform a "final reduction" in field F255 (field for Curve25519)
 339  * The source value must be less than twice the modulus. If the value
 340  * is not lower than the modulus, then the modulus is subtracted and
 341  * this function returns 1; otherwise, it leaves it untouched and it
 342  * returns 0.
 343  */
 344 static uint32_t
 345 reduce_final_f255(uint32_t *d)
 346 {
 347         uint32_t t[9];
 348         uint32_t cc;
 349         int i;
 350
 351         memcpy(t, d, sizeof t);
 352         cc = 19;
 353         for (i = 0; i < 9; i ++) {
 354                 uint32_t w;
 355
 356                 w = t[i] + cc;
 357                 cc = w >> 30;
 358                 t[i] = w & 0x3FFFFFFF;
 359         }
 360         cc = t[8] >> 15;
 361         t[8] &= 0x7FFF;
 362         CCOPY(cc, d, t, sizeof t);
 363         return cc;
 364 }
 365
 366 /*
 367  * Perform a multiplication of two integers modulo 2^255-19.
 368  * Operands are arrays of 9 words, each containing 30 bits of data, in
 369  * little-endian order. Input value may be up to 2^256-1; on output, value
 370  * fits on 256 bits and is lower than twice the modulus.
 371  */
 372 static void
 373 f255_mul(uint32_t *d, const uint32_t *a, const uint32_t *b)
 374 {
 375         uint32_t t[18], cc;
 376         int i;
 377
 378         /*
 379          * Compute raw multiplication. All result words fit in 30 bits
 380          * each; upper word (t[17]) must fit on 2 bits, since the product
 381          * of two 256-bit integers must fit on 512 bits.
 382          */
 383         mul9(t, a, b);
 384
 385         /*
 386          * Modular reduction: each high word is added where necessary.
 387          * Since the modulus is 2^255-19 and word 9 corresponds to
 388          * offset 9*30 = 270, word 9+k must be added to word k with
 389          * a factor of 19*2^15 = 622592. The extra bits in word 8 are also
 390          * added that way.
 391          *
 392          * Keeping the carry on 32 bits helps with 32-bit architectures,
 393          * and does not noticeably impact performance on 64-bit systems.
 394          */
 395         cc = MUL15(t[8] >> 15, 19);  /* at most 19*(2^15-1) = 622573 */
 396         t[8] &= 0x7FFF;
 397         for (i = 0; i < 9; i ++) {
 398                 uint64_t w;
 399
 400                 w = (uint64_t)t[i] + (uint64_t)cc + MUL31(t[i + 9], 622592);
 401                 t[i] = (uint32_t)w & 0x3FFFFFFF;
 402                 cc = (uint32_t)(w >> 30);  /* at most 622592 */
 403         }
 404
 405         /*
 406          * Original product was up to (2^256-1)^2, i.e. a 512-bit integer.
 407          * This was split into two parts (upper of 257 bits, lower of 255
 408          * bits), and the upper was added to the lower with a factor 19,
 409          * which means that the intermediate value is less than 77*2^255
 410          * (19*2^257 + 2^255). Therefore, the extra bits "t[8] >> 15" are
 411          * less than 77, and the initial carry cc is at most 76*19 = 1444.
 412          */
 413         cc = MUL15(t[8] >> 15, 19);
 414         t[8] &= 0x7FFF;
 415         for (i = 0; i < 9; i ++) {
 416                 uint32_t z;
 417
 418                 z = t[i] + cc;
 419                 d[i] = z & 0x3FFFFFFF;
 420                 cc = z >> 30;
 421         }
 422
 423         /*
 424          * Final result is at most 2^255 + 1443. In particular, the last
 425          * carry is necessarily 0, since t[8] was truncated to 15 bits.
 426          */
 427 }
 428
 429 /*
 430  * Perform a squaring of an integer modulo 2^255-19.
 431  * Operands are arrays of 9 words, each containing 30 bits of data, in
 432  * little-endian order. Input value may be up to 2^256-1; on output, value
 433  * fits on 256 bits and is lower than twice the modulus.
 434  */
 435 static void
 436 f255_square(uint32_t *d, const uint32_t *a)
 437 {
 438         uint32_t t[18], cc;
 439         int i;
 440
 441         /*
 442          * Compute raw squaring. All result words fit in 30 bits
 443          * each; upper word (t[17]) must fit on 2 bits, since the square
 444          * of a 256-bit integers must fit on 512 bits.
 445          */
 446         square9(t, a);
 447
 448         /*
 449          * Modular reduction: each high word is added where necessary.
 450          * See f255_mul() for details on the reduction and carry limits.
 451          */
 452         cc = MUL15(t[8] >> 15, 19);
 453         t[8] &= 0x7FFF;
 454         for (i = 0; i < 9; i ++) {
 455                 uint64_t w;
 456
 457                 w = (uint64_t)t[i] + (uint64_t)cc + MUL31(t[i + 9], 622592);
 458                 t[i] = (uint32_t)w & 0x3FFFFFFF;
 459                 cc = (uint32_t)(w >> 30);
 460         }
 461         cc = MUL15(t[8] >> 15, 19);
 462         t[8] &= 0x7FFF;
 463         for (i = 0; i < 9; i ++) {
 464                 uint32_t z;
 465
 466                 z = t[i] + cc;
 467                 d[i] = z & 0x3FFFFFFF;
 468                 cc = z >> 30;
 469         }
 470 }
 471
 472 /*
 473  * Add two values in F255. Partial reduction is performed (down to less
 474  * than twice the modulus).
 475  */
 476 static void
 477 f255_add(uint32_t *d, const uint32_t *a, const uint32_t *b)
 478 {
 479         /*
 480          * Since operand words fit on 30 bits, we can use 32-bit
 481          * variables throughout.
 482          */
 483         int i;
 484         uint32_t cc, w;
 485
 486         cc = 0;
 487         for (i = 0; i < 9; i ++) {
 488                 w = a[i] + b[i] + cc;
 489                 d[i] = w & 0x3FFFFFFF;
 490                 cc = w >> 30;
 491         }
 492         cc = MUL15(w >> 15, 19);
 493         d[8] &= 0x7FFF;
 494         for (i = 0; i < 9; i ++) {
 495                 w = d[i] + cc;
 496                 d[i] = w & 0x3FFFFFFF;
 497                 cc = w >> 30;
 498         }
 499 }
 500
 501 /*
 502  * Subtract one value from another in F255. Partial reduction is
 503  * performed (down to less than twice the modulus).
 504  */
 505 static void
 506 f255_sub(uint32_t *d, const uint32_t *a, const uint32_t *b)
 507 {
 508         /*
 509          * We actually compute a - b + 2*p, so that the final value is
 510          * necessarily positive.
 511          */
 512         int i;
 513         uint32_t cc, w;
 514
 515         cc = (uint32_t)-38;
 516         for (i = 0; i < 9; i ++) {
 517                 w = a[i] - b[i] + cc;
 518                 d[i] = w & 0x3FFFFFFF;
 519                 cc = ARSH(w, 30);
 520         }
 521         cc = MUL15((w + 0x10000) >> 15, 19);
 522         d[8] &= 0x7FFF;
 523         for (i = 0; i < 9; i ++) {
 524                 w = d[i] + cc;
 525                 d[i] = w & 0x3FFFFFFF;
 526                 cc = w >> 30;
 527         }
 528 }
 529
 530 /*
 531  * Multiply an integer by the 'A24' constant (121665). Partial reduction
 532  * is performed (down to less than twice the modulus).
 533  */
 534 static void
 535 f255_mul_a24(uint32_t *d, const uint32_t *a)
 536 {
 537         int i;
 538         uint64_t w;
 539         uint32_t cc;
 540
 541         /*
 542          * a[] is over 256 bits, thus a[8] has length at most 16 bits.
 543          * We single out the processing of the last word: intermediate
 544          * value w is up to 121665*2^16, yielding a carry for the next
 545          * loop of at most 19*(121665*2^16/2^15) = 4623289.
 546          */
 547         cc = 0;
 548         for (i = 0; i < 8; i ++) {
 549                 w = MUL31(a[i], 121665) + (uint64_t)cc;
 550                 d[i] = (uint32_t)w & 0x3FFFFFFF;
 551                 cc = (uint32_t)(w >> 30);
 552         }
 553         w = MUL31(a[8], 121665) + (uint64_t)cc;
 554         d[8] = (uint32_t)w & 0x7FFF;
 555         cc = MUL15((uint32_t)(w >> 15), 19);
 556
 557         for (i = 0; i < 9; i ++) {
 558                 uint32_t z;
 559
 560                 z = d[i] + cc;
 561                 d[i] = z & 0x3FFFFFFF;
 562                 cc = z >> 30;
 563         }
 564 }
 565
 566 static const unsigned char GEN[] = {
 567         0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
 568         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
 569         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
 570         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
 571 };
 572
 573 static const unsigned char ORDER[] = {
 574         0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
 575         0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
 576         0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
 577         0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
 578 };
 579
 580 static const unsigned char *
 581 api_generator(int curve, size_t *len)
 582 {
 583         (void)curve;
 584         *len = 32;
 585         return GEN;
 586 }
 587
 588 static const unsigned char *
 589 api_order(int curve, size_t *len)
 590 {
 591         (void)curve;
 592         *len = 32;
 593         return ORDER;
 594 }
 595
 596 static size_t
 597 api_xoff(int curve, size_t *len)
 598 {
 599         (void)curve;
 600         *len = 32;
 601         return 0;
 602 }
 603
 604 static void
 605 cswap(uint32_t *a, uint32_t *b, uint32_t ctl)
 606 {
 607         int i;
 608
 609         ctl = -ctl;
 610         for (i = 0; i < 9; i ++) {
 611                 uint32_t aw, bw, tw;
 612
 613                 aw = a[i];
 614                 bw = b[i];
 615                 tw = ctl & (aw ^ bw);
 616                 a[i] = aw ^ tw;
 617                 b[i] = bw ^ tw;
 618         }
 619 }
 620
 621 static uint32_t
 622 api_mul(unsigned char *G, size_t Glen,
 623         const unsigned char *kb, size_t kblen, int curve)
 624 {
 625         uint32_t x1[9], x2[9], x3[9], z2[9], z3[9];
 626         uint32_t a[9], aa[9], b[9], bb[9];
 627         uint32_t c[9], d[9], e[9], da[9], cb[9];
 628         unsigned char k[32];
 629         uint32_t swap;
 630         int i;
 631
 632         (void)curve;
 633
 634         /*
 635          * Points are encoded over exactly 32 bytes. Multipliers must fit
 636          * in 32 bytes as well.
 637          * RFC 7748 mandates that the high bit of the last point byte must
 638          * be ignored/cleared.
 639          */
 640         if (Glen != 32 || kblen > 32) {
 641                 return 0;
 642         }
 643         G[31] &= 0x7F;
 644
 645         /*
 646          * Initialise variables x1, x2, z2, x3 and z3. We set all of them
 647          * into Montgomery representation.
 648          */
 649         x1[8] = le8_to_le30(x1, G, 32);
 650         memcpy(x3, x1, sizeof x1);
 651         memset(z2, 0, sizeof z2);
 652         memset(x2, 0, sizeof x2);
 653         x2[0] = 1;
 654         memset(z3, 0, sizeof z3);
 655         z3[0] = 1;
 656
 657         memset(k, 0, (sizeof k) - kblen);
 658         memcpy(k + (sizeof k) - kblen, kb, kblen);
 659         k[31] &= 0xF8;
 660         k[0] &= 0x7F;
 661         k[0] |= 0x40;
 662
 663         /* obsolete
 664         print_int("x1", x1);
 665         */
 666
 667         swap = 0;
 668         for (i = 254; i >= 0; i --) {
 669                 uint32_t kt;
 670
 671                 kt = (k[31 - (i >> 3)] >> (i & 7)) & 1;
 672                 swap ^= kt;
 673                 cswap(x2, x3, swap);
 674                 cswap(z2, z3, swap);
 675                 swap = kt;
 676
 677                 /* obsolete
 678                 print_int("x2", x2);
 679                 print_int("z2", z2);
 680                 print_int("x3", x3);
 681                 print_int("z3", z3);
 682                 */
 683
 684                 f255_add(a, x2, z2);
 685                 f255_square(aa, a);
 686                 f255_sub(b, x2, z2);
 687                 f255_square(bb, b);
 688                 f255_sub(e, aa, bb);
 689                 f255_add(c, x3, z3);
 690                 f255_sub(d, x3, z3);
 691                 f255_mul(da, d, a);
 692                 f255_mul(cb, c, b);
 693
 694                 /* obsolete
 695                 print_int("a ", a);
 696                 print_int("aa", aa);
 697                 print_int("b ", b);
 698                 print_int("bb", bb);
 699                 print_int("e ", e);
 700                 print_int("c ", c);
 701                 print_int("d ", d);
 702                 print_int("da", da);
 703                 print_int("cb", cb);
 704                 */
 705
 706                 f255_add(x3, da, cb);
 707                 f255_square(x3, x3);
 708                 f255_sub(z3, da, cb);
 709                 f255_square(z3, z3);
 710                 f255_mul(z3, z3, x1);
 711                 f255_mul(x2, aa, bb);
 712                 f255_mul_a24(z2, e);
 713                 f255_add(z2, z2, aa);
 714                 f255_mul(z2, e, z2);
 715
 716                 /* obsolete
 717                 print_int("x2", x2);
 718                 print_int("z2", z2);
 719                 print_int("x3", x3);
 720                 print_int("z3", z3);
 721                 */
 722         }
 723         cswap(x2, x3, swap);
 724         cswap(z2, z3, swap);
 725
 726         /*
 727          * Inverse z2 with a modular exponentiation. This is a simple
 728          * square-and-multiply algorithm; we mutualise most non-squarings
 729          * since the exponent contains almost only ones.
 730          */
 731         memcpy(a, z2, sizeof z2);
 732         for (i = 0; i < 15; i ++) {
 733                 f255_square(a, a);
 734                 f255_mul(a, a, z2);
 735         }
 736         memcpy(b, a, sizeof a);
 737         for (i = 0; i < 14; i ++) {
 738                 int j;
 739
 740                 for (j = 0; j < 16; j ++) {
 741                         f255_square(b, b);
 742                 }
 743                 f255_mul(b, b, a);
 744         }
 745         for (i = 14; i >= 0; i --) {
 746                 f255_square(b, b);
 747                 if ((0xFFEB >> i) & 1) {
 748                         f255_mul(b, z2, b);
 749                 }
 750         }
 751         f255_mul(x2, x2, b);
 752         reduce_final_f255(x2);
 753         le30_to_le8(G, 32, x2);
 754         return 1;
 755 }
 756
 757 static size_t
 758 api_mulgen(unsigned char *R,
 759         const unsigned char *x, size_t xlen, int curve)
 760 {
 761         const unsigned char *G;
 762         size_t Glen;
 763
 764         G = api_generator(curve, &Glen);
 765         memcpy(R, G, Glen);
 766         api_mul(R, Glen, x, xlen, curve);
 767         return Glen;
 768 }
 769
 770 static uint32_t
 771 api_muladd(unsigned char *A, const unsigned char *B, size_t len,
 772         const unsigned char *x, size_t xlen,
 773         const unsigned char *y, size_t ylen, int curve)
 774 {
 775         /*
 776          * We don't implement this method, since it is used for ECDSA
 777          * only, and there is no ECDSA over Curve25519 (which instead
 778          * uses EdDSA).
 779          */
 780         (void)A;
 781         (void)B;
 782         (void)len;
 783         (void)x;
 784         (void)xlen;
 785         (void)y;
 786         (void)ylen;
 787         (void)curve;
 788         return 0;
 789 }
 790
 791 /* see bearssl_ec.h */
 792 const br_ec_impl br_ec_c25519_m31 = {
 793         (uint32_t)0x20000000,
 794         &api_generator,
 795         &api_order,
 796         &api_xoff,
 797         &api_mul,
 798         &api_mulgen,
 799         &api_muladd
 800 };