_ Git - BearSSL/blob - src/ec/ec_c25519_m31.c

   1 /*
   2  * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining
   5  * a copy of this software and associated documentation files (the
   6  * "Software"), to deal in the Software without restriction, including
   7  * without limitation the rights to use, copy, modify, merge, publish,
   8  * distribute, sublicense, and/or sell copies of the Software, and to
   9  * permit persons to whom the Software is furnished to do so, subject to
  10  * the following conditions:
  11  *
  12  * The above copyright notice and this permission notice shall be
  13  * included in all copies or substantial portions of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  16  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  17  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  18  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  19  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  20  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  21  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24
  25 #include "inner.h"
  26
  27 /* obsolete
  28 #include <stdio.h>
  29 #include <stdlib.h>
  30 static void
  31 print_int(const char *name, const uint32_t *x)
  32 {
  33         size_t u;
  34         unsigned char tmp[40];
  35
  36         printf("%s = ", name);
  37         for (u = 0; u < 9; u ++) {
  38                 if (x[u] > 0x3FFFFFFF) {
  39                         printf("INVALID:");
  40                         for (u = 0; u < 9; u ++) {
  41                                 printf(" %08X", x[u]);
  42                         }
  43                         printf("\n");
  44                         return;
  45                 }
  46         }
  47         memset(tmp, 0, sizeof tmp);
  48         for (u = 0; u < 9; u ++) {
  49                 uint64_t w;
  50                 int j, k;
  51
  52                 w = x[u];
  53                 j = 30 * (int)u;
  54                 k = j & 7;
  55                 if (k != 0) {
  56                         w <<= k;
  57                         j -= k;
  58                 }
  59                 k = j >> 3;
  60                 for (j = 0; j < 8; j ++) {
  61                         tmp[39 - k - j] |= (unsigned char)w;
  62                         w >>= 8;
  63                 }
  64         }
  65         for (u = 8; u < 40; u ++) {
  66                 printf("%02X", tmp[u]);
  67         }
  68         printf("\n");
  69 }
  70 */
  71
  72 /*
  73  * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
  74  * that right-shifting a signed negative integer copies the sign bit
  75  * (arithmetic right-shift). This is "implementation-defined behaviour",
  76  * i.e. it is not undefined, but it may differ between compilers. Each
  77  * compiler is supposed to document its behaviour in that respect. GCC
  78  * explicitly defines that an arithmetic right shift is used. We expect
  79  * all other compilers to do the same, because underlying CPU offer an
  80  * arithmetic right shift opcode that could not be used otherwise.
  81  */
  82 #if BR_NO_ARITH_SHIFT
  83 #define ARSH(x, n)   (((uint32_t)(x) >> (n)) \
  84                     | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
  85 #else
  86 #define ARSH(x, n)   ((*(int32_t *)&(x)) >> (n))
  87 #endif
  88
  89 /*
  90  * Convert an integer from unsigned little-endian encoding to a sequence of
  91  * 30-bit words in little-endian order. The final "partial" word is
  92  * returned.
  93  */
  94 static uint32_t
  95 le8_to_le30(uint32_t *dst, const unsigned char *src, size_t len)
  96 {
  97         uint32_t acc;
  98         int acc_len;
  99
 100         acc = 0;
 101         acc_len = 0;
 102         while (len -- > 0) {
 103                 uint32_t b;
 104
 105                 b = *src ++;
 106                 if (acc_len < 22) {
 107                         acc |= b << acc_len;
 108                         acc_len += 8;
 109                 } else {
 110                         *dst ++ = (acc | (b << acc_len)) & 0x3FFFFFFF;
 111                         acc = b >> (30 - acc_len);
 112                         acc_len -= 22;
 113                 }
 114         }
 115         return acc;
 116 }
 117
 118 /*
 119  * Convert an integer (30-bit words, little-endian) to unsigned
 120  * little-endian encoding. The total encoding length is provided; all
 121  * the destination bytes will be filled.
 122  */
 123 static void
 124 le30_to_le8(unsigned char *dst, size_t len, const uint32_t *src)
 125 {
 126         uint32_t acc;
 127         int acc_len;
 128
 129         acc = 0;
 130         acc_len = 0;
 131         while (len -- > 0) {
 132                 if (acc_len < 8) {
 133                         uint32_t w;
 134
 135                         w = *src ++;
 136                         *dst ++ = (unsigned char)(acc | (w << acc_len));
 137                         acc = w >> (8 - acc_len);
 138                         acc_len += 22;
 139                 } else {
 140                         *dst ++ = (unsigned char)acc;
 141                         acc >>= 8;
 142                         acc_len -= 8;
 143                 }
 144         }
 145 }
 146
 147 /*
 148  * Multiply two integers. Source integers are represented as arrays of
 149  * nine 30-bit words, for values up to 2^270-1. Result is encoded over
 150  * 18 words of 30 bits each.
 151  */
 152 static void
 153 mul9(uint32_t *d, const uint32_t *a, const uint32_t *b)
 154 {
 155         /*
 156          * Maximum intermediate result is no more than
 157          * 10376293531797946367, which fits in 64 bits. Reason:
 158          *
 159          *   10376293531797946367 = 9 * (2^30-1)^2 + 9663676406
 160          *   10376293531797946367 < 9663676407 * 2^30
 161          *
 162          * Thus, adding together 9 products of 30-bit integers, with
 163          * a carry of at most 9663676406, yields an integer that fits
 164          * on 64 bits and generates a carry of at most 9663676406.
 165          */
 166         uint64_t t[17];
 167         uint64_t cc;
 168         int i;
 169
 170         t[ 0] = MUL31(a[0], b[0]);
 171         t[ 1] = MUL31(a[0], b[1])
 172                 + MUL31(a[1], b[0]);
 173         t[ 2] = MUL31(a[0], b[2])
 174                 + MUL31(a[1], b[1])
 175                 + MUL31(a[2], b[0]);
 176         t[ 3] = MUL31(a[0], b[3])
 177                 + MUL31(a[1], b[2])
 178                 + MUL31(a[2], b[1])
 179                 + MUL31(a[3], b[0]);
 180         t[ 4] = MUL31(a[0], b[4])
 181                 + MUL31(a[1], b[3])
 182                 + MUL31(a[2], b[2])
 183                 + MUL31(a[3], b[1])
 184                 + MUL31(a[4], b[0]);
 185         t[ 5] = MUL31(a[0], b[5])
 186                 + MUL31(a[1], b[4])
 187                 + MUL31(a[2], b[3])
 188                 + MUL31(a[3], b[2])
 189                 + MUL31(a[4], b[1])
 190                 + MUL31(a[5], b[0]);
 191         t[ 6] = MUL31(a[0], b[6])
 192                 + MUL31(a[1], b[5])
 193                 + MUL31(a[2], b[4])
 194                 + MUL31(a[3], b[3])
 195                 + MUL31(a[4], b[2])
 196                 + MUL31(a[5], b[1])
 197                 + MUL31(a[6], b[0]);
 198         t[ 7] = MUL31(a[0], b[7])
 199                 + MUL31(a[1], b[6])
 200                 + MUL31(a[2], b[5])
 201                 + MUL31(a[3], b[4])
 202                 + MUL31(a[4], b[3])
 203                 + MUL31(a[5], b[2])
 204                 + MUL31(a[6], b[1])
 205                 + MUL31(a[7], b[0]);
 206         t[ 8] = MUL31(a[0], b[8])
 207                 + MUL31(a[1], b[7])
 208                 + MUL31(a[2], b[6])
 209                 + MUL31(a[3], b[5])
 210                 + MUL31(a[4], b[4])
 211                 + MUL31(a[5], b[3])
 212                 + MUL31(a[6], b[2])
 213                 + MUL31(a[7], b[1])
 214                 + MUL31(a[8], b[0]);
 215         t[ 9] = MUL31(a[1], b[8])
 216                 + MUL31(a[2], b[7])
 217                 + MUL31(a[3], b[6])
 218                 + MUL31(a[4], b[5])
 219                 + MUL31(a[5], b[4])
 220                 + MUL31(a[6], b[3])
 221                 + MUL31(a[7], b[2])
 222                 + MUL31(a[8], b[1]);
 223         t[10] = MUL31(a[2], b[8])
 224                 + MUL31(a[3], b[7])
 225                 + MUL31(a[4], b[6])
 226                 + MUL31(a[5], b[5])
 227                 + MUL31(a[6], b[4])
 228                 + MUL31(a[7], b[3])
 229                 + MUL31(a[8], b[2]);
 230         t[11] = MUL31(a[3], b[8])
 231                 + MUL31(a[4], b[7])
 232                 + MUL31(a[5], b[6])
 233                 + MUL31(a[6], b[5])
 234                 + MUL31(a[7], b[4])
 235                 + MUL31(a[8], b[3]);
 236         t[12] = MUL31(a[4], b[8])
 237                 + MUL31(a[5], b[7])
 238                 + MUL31(a[6], b[6])
 239                 + MUL31(a[7], b[5])
 240                 + MUL31(a[8], b[4]);
 241         t[13] = MUL31(a[5], b[8])
 242                 + MUL31(a[6], b[7])
 243                 + MUL31(a[7], b[6])
 244                 + MUL31(a[8], b[5]);
 245         t[14] = MUL31(a[6], b[8])
 246                 + MUL31(a[7], b[7])
 247                 + MUL31(a[8], b[6]);
 248         t[15] = MUL31(a[7], b[8])
 249                 + MUL31(a[8], b[7]);
 250         t[16] = MUL31(a[8], b[8]);
 251
 252         /*
 253          * Propagate carries.
 254          */
 255         cc = 0;
 256         for (i = 0; i < 17; i ++) {
 257                 uint64_t w;
 258
 259                 w = t[i] + cc;
 260                 d[i] = (uint32_t)w & 0x3FFFFFFF;
 261                 cc = w >> 30;
 262         }
 263         d[17] = (uint32_t)cc;
 264 }
 265
 266 /*
 267  * Square a 270-bit integer, represented as an array of nine 30-bit words.
 268  * Result uses 18 words of 30 bits each.
 269  */
 270 static void
 271 square9(uint32_t *d, const uint32_t *a)
 272 {
 273         uint64_t t[17];
 274         uint64_t cc;
 275         int i;
 276
 277         t[ 0] = MUL31(a[0], a[0]);
 278         t[ 1] = ((MUL31(a[0], a[1])) << 1);
 279         t[ 2] = MUL31(a[1], a[1])
 280                 + ((MUL31(a[0], a[2])) << 1);
 281         t[ 3] = ((MUL31(a[0], a[3])
 282                 + MUL31(a[1], a[2])) << 1);
 283         t[ 4] = MUL31(a[2], a[2])
 284                 + ((MUL31(a[0], a[4])
 285                 + MUL31(a[1], a[3])) << 1);
 286         t[ 5] = ((MUL31(a[0], a[5])
 287                 + MUL31(a[1], a[4])
 288                 + MUL31(a[2], a[3])) << 1);
 289         t[ 6] = MUL31(a[3], a[3])
 290                 + ((MUL31(a[0], a[6])
 291                 + MUL31(a[1], a[5])
 292                 + MUL31(a[2], a[4])) << 1);
 293         t[ 7] = ((MUL31(a[0], a[7])
 294                 + MUL31(a[1], a[6])
 295                 + MUL31(a[2], a[5])
 296                 + MUL31(a[3], a[4])) << 1);
 297         t[ 8] = MUL31(a[4], a[4])
 298                 + ((MUL31(a[0], a[8])
 299                 + MUL31(a[1], a[7])
 300                 + MUL31(a[2], a[6])
 301                 + MUL31(a[3], a[5])) << 1);
 302         t[ 9] = ((MUL31(a[1], a[8])
 303                 + MUL31(a[2], a[7])
 304                 + MUL31(a[3], a[6])
 305                 + MUL31(a[4], a[5])) << 1);
 306         t[10] = MUL31(a[5], a[5])
 307                 + ((MUL31(a[2], a[8])
 308                 + MUL31(a[3], a[7])
 309                 + MUL31(a[4], a[6])) << 1);
 310         t[11] = ((MUL31(a[3], a[8])
 311                 + MUL31(a[4], a[7])
 312                 + MUL31(a[5], a[6])) << 1);
 313         t[12] = MUL31(a[6], a[6])
 314                 + ((MUL31(a[4], a[8])
 315                 + MUL31(a[5], a[7])) << 1);
 316         t[13] = ((MUL31(a[5], a[8])
 317                 + MUL31(a[6], a[7])) << 1);
 318         t[14] = MUL31(a[7], a[7])
 319                 + ((MUL31(a[6], a[8])) << 1);
 320         t[15] = ((MUL31(a[7], a[8])) << 1);
 321         t[16] = MUL31(a[8], a[8]);
 322
 323         /*
 324          * Propagate carries.
 325          */
 326         cc = 0;
 327         for (i = 0; i < 17; i ++) {
 328                 uint64_t w;
 329
 330                 w = t[i] + cc;
 331                 d[i] = (uint32_t)w & 0x3FFFFFFF;
 332                 cc = w >> 30;
 333         }
 334         d[17] = (uint32_t)cc;
 335 }
 336
 337 /*
 338  * Perform a "final reduction" in field F255 (field for Curve25519)
 339  * The source value must be less than twice the modulus. If the value
 340  * is not lower than the modulus, then the modulus is subtracted and
 341  * this function returns 1; otherwise, it leaves it untouched and it
 342  * returns 0.
 343  */
 344 static uint32_t
 345 reduce_final_f255(uint32_t *d)
 346 {
 347         uint32_t t[9];
 348         uint32_t cc;
 349         int i;
 350
 351         memcpy(t, d, sizeof t);
 352         cc = 19;
 353         for (i = 0; i < 9; i ++) {
 354                 uint32_t w;
 355
 356                 w = t[i] + cc;
 357                 cc = w >> 30;
 358                 t[i] = w & 0x3FFFFFFF;
 359         }
 360         cc = t[8] >> 15;
 361         t[8] &= 0x7FFF;
 362         CCOPY(cc, d, t, sizeof t);
 363         return cc;
 364 }
 365
 366 /*
 367  * Perform a multiplication of two integers modulo 2^255-19.
 368  * Operands are arrays of 9 words, each containing 30 bits of data, in
 369  * little-endian order. Input value may be up to 2^256-1; on output, value
 370  * fits on 256 bits and is lower than twice the modulus.
 371  */
 372 static void
 373 f255_mul(uint32_t *d, const uint32_t *a, const uint32_t *b)
 374 {
 375         uint32_t t[18];
 376         uint64_t cc, w;
 377         int i;
 378
 379         /*
 380          * Compute raw multiplication. All result words fit in 30 bits
 381          * each; upper word (t[17]) must fit on 2 bits, since the product
 382          * of two 256-bit integers must fit on 512 bits.
 383          */
 384         mul9(t, a, b);
 385
 386         /*
 387          * Modular reduction: each high word is added where necessary.
 388          * Since the modulus is 2^255-19 and word 9 corresponds to
 389          * offset 9*30 = 270, word 9+k must be added to word k with
 390          * a factor of 19*2^15 = 622592. The extra bits in word 8 are also
 391          * added that way.
 392          */
 393         cc = MUL31(t[8] >> 15, 19);
 394         t[8] &= 0x7FFF;
 395         for (i = 0; i < 9; i ++) {
 396                 w = (uint64_t)t[i] + cc + MUL31(t[i + 9], 622592);
 397                 t[i] = (uint32_t)w & 0x3FFFFFFF;
 398                 cc = w >> 30;
 399         }
 400         cc = MUL31(w >> 15, 19);
 401         t[8] &= 0x7FFF;
 402         for (i = 0; i < 9; i ++) {
 403                 w = t[i] + cc;
 404                 d[i] = (uint32_t)w & 0x3FFFFFFF;
 405                 cc = w >> 30;
 406         }
 407 }
 408
 409 /*
 410  * Perform a squaring of an integer modulo 2^255-19.
 411  * Operands are arrays of 9 words, each containing 30 bits of data, in
 412  * little-endian order. Input value may be up to 2^256-1; on output, value
 413  * fits on 256 bits and is lower than twice the modulus.
 414  */
 415 static void
 416 f255_square(uint32_t *d, const uint32_t *a)
 417 {
 418         uint32_t t[18];
 419         uint64_t cc, w;
 420         int i;
 421
 422         /*
 423          * Compute raw squaring. All result words fit in 30 bits
 424          * each; upper word (t[17]) must fit on 2 bits, since the square
 425          * of a 256-bit integers must fit on 512 bits.
 426          */
 427         square9(t, a);
 428
 429         /*
 430          * Modular reduction: each high word is added where necessary.
 431          * Since the modulus is 2^255-19 and word 9 corresponds to
 432          * offset 9*30 = 270, word 9+k must be added to word k with
 433          * a factor of 19*2^15 = 622592. The extra bits in word 8 are also
 434          * added that way.
 435          */
 436         cc = MUL31(t[8] >> 15, 19);
 437         t[8] &= 0x7FFF;
 438         for (i = 0; i < 9; i ++) {
 439                 w = (uint64_t)t[i] + cc + MUL31(t[i + 9], 622592);
 440                 t[i] = (uint32_t)w & 0x3FFFFFFF;
 441                 cc = w >> 30;
 442         }
 443         cc = MUL31(w >> 15, 19);
 444         t[8] &= 0x7FFF;
 445         for (i = 0; i < 9; i ++) {
 446                 w = t[i] + cc;
 447                 d[i] = (uint32_t)w & 0x3FFFFFFF;
 448                 cc = w >> 30;
 449         }
 450 }
 451
 452 /*
 453  * Add two values in F255. Partial reduction is performed (down to less
 454  * than twice the modulus).
 455  */
 456 static void
 457 f255_add(uint32_t *d, const uint32_t *a, const uint32_t *b)
 458 {
 459         /*
 460          * Since operand words fit on 30 bits, we can use 32-bit
 461          * variables throughout.
 462          */
 463         int i;
 464         uint32_t cc, w;
 465
 466         cc = 0;
 467         for (i = 0; i < 9; i ++) {
 468                 w = a[i] + b[i] + cc;
 469                 d[i] = w & 0x3FFFFFFF;
 470                 cc = w >> 30;
 471         }
 472         cc = MUL15(w >> 15, 19);
 473         d[8] &= 0x7FFF;
 474         for (i = 0; i < 9; i ++) {
 475                 w = d[i] + cc;
 476                 d[i] = w & 0x3FFFFFFF;
 477                 cc = w >> 30;
 478         }
 479 }
 480
 481 /*
 482  * Subtract one value from another in F255. Partial reduction is
 483  * performed (down to less than twice the modulus).
 484  */
 485 static void
 486 f255_sub(uint32_t *d, const uint32_t *a, const uint32_t *b)
 487 {
 488         /*
 489          * We actually compute a - b + 2*p, so that the final value is
 490          * necessarily positive.
 491          */
 492         int i;
 493         uint32_t cc, w;
 494
 495         cc = (uint32_t)-38;
 496         for (i = 0; i < 9; i ++) {
 497                 w = a[i] - b[i] + cc;
 498                 d[i] = w & 0x3FFFFFFF;
 499                 cc = ARSH(w, 30);
 500         }
 501         cc = MUL15((w + 0x10000) >> 15, 19);
 502         d[8] &= 0x7FFF;
 503         for (i = 0; i < 9; i ++) {
 504                 w = d[i] + cc;
 505                 d[i] = w & 0x3FFFFFFF;
 506                 cc = w >> 30;
 507         }
 508 }
 509
 510 /*
 511  * Multiply an integer by the 'A24' constant (121665). Partial reduction
 512  * is performed (down to less than twice the modulus).
 513  */
 514 static void
 515 f255_mul_a24(uint32_t *d, const uint32_t *a)
 516 {
 517         int i;
 518         uint64_t cc, w;
 519
 520         cc = 0;
 521         for (i = 0; i < 9; i ++) {
 522                 w = MUL31(a[i], 121665) + cc;
 523                 d[i] = (uint32_t)w & 0x3FFFFFFF;
 524                 cc = w >> 30;
 525         }
 526         cc = MUL31((uint32_t)(w >> 15), 19);
 527         d[8] &= 0x7FFF;
 528         for (i = 0; i < 9; i ++) {
 529                 w = (uint64_t)d[i] + cc;
 530                 d[i] = w & 0x3FFFFFFF;
 531                 cc = w >> 30;
 532         }
 533 }
 534
 535 static const unsigned char GEN[] = {
 536         0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
 537         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
 538         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
 539         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
 540 };
 541
 542 static const unsigned char ORDER[] = {
 543         0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
 544         0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
 545         0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
 546         0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
 547 };
 548
 549 static const unsigned char *
 550 api_generator(int curve, size_t *len)
 551 {
 552         (void)curve;
 553         *len = 32;
 554         return GEN;
 555 }
 556
 557 static const unsigned char *
 558 api_order(int curve, size_t *len)
 559 {
 560         (void)curve;
 561         *len = 32;
 562         return ORDER;
 563 }
 564
 565 static size_t
 566 api_xoff(int curve, size_t *len)
 567 {
 568         (void)curve;
 569         *len = 32;
 570         return 0;
 571 }
 572
 573 static void
 574 cswap(uint32_t *a, uint32_t *b, uint32_t ctl)
 575 {
 576         int i;
 577
 578         ctl = -ctl;
 579         for (i = 0; i < 9; i ++) {
 580                 uint32_t aw, bw, tw;
 581
 582                 aw = a[i];
 583                 bw = b[i];
 584                 tw = ctl & (aw ^ bw);
 585                 a[i] = aw ^ tw;
 586                 b[i] = bw ^ tw;
 587         }
 588 }
 589
 590 static uint32_t
 591 api_mul(unsigned char *G, size_t Glen,
 592         const unsigned char *kb, size_t kblen, int curve)
 593 {
 594         uint32_t x1[9], x2[9], x3[9], z2[9], z3[9];
 595         uint32_t a[9], aa[9], b[9], bb[9];
 596         uint32_t c[9], d[9], e[9], da[9], cb[9];
 597         unsigned char k[32];
 598         uint32_t swap;
 599         int i;
 600
 601         (void)curve;
 602
 603         /*
 604          * Points are encoded over exactly 32 bytes. Multipliers must fit
 605          * in 32 bytes as well.
 606          * RFC 7748 mandates that the high bit of the last point byte must
 607          * be ignored/cleared.
 608          */
 609         if (Glen != 32 || kblen > 32) {
 610                 return 0;
 611         }
 612         G[31] &= 0x7F;
 613
 614         /*
 615          * Initialise variables x1, x2, z2, x3 and z3. We set all of them
 616          * into Montgomery representation.
 617          */
 618         x1[8] = le8_to_le30(x1, G, 32);
 619         memcpy(x3, x1, sizeof x1);
 620         memset(z2, 0, sizeof z2);
 621         memset(x2, 0, sizeof x2);
 622         x2[0] = 1;
 623         memset(z3, 0, sizeof z3);
 624         z3[0] = 1;
 625
 626         memcpy(k, kb, kblen);
 627         memset(k + kblen, 0, (sizeof k) - kblen);
 628         k[0] &= 0xF8;
 629         k[31] &= 0x7F;
 630         k[31] |= 0x40;
 631
 632         /* obsolete
 633         print_int("x1", x1);
 634         */
 635
 636         swap = 0;
 637         for (i = 254; i >= 0; i --) {
 638                 uint32_t kt;
 639
 640                 kt = (k[i >> 3] >> (i & 7)) & 1;
 641                 swap ^= kt;
 642                 cswap(x2, x3, swap);
 643                 cswap(z2, z3, swap);
 644                 swap = kt;
 645
 646                 /* obsolete
 647                 print_int("x2", x2);
 648                 print_int("z2", z2);
 649                 print_int("x3", x3);
 650                 print_int("z3", z3);
 651                 */
 652
 653                 f255_add(a, x2, z2);
 654                 f255_square(aa, a);
 655                 f255_sub(b, x2, z2);
 656                 f255_square(bb, b);
 657                 f255_sub(e, aa, bb);
 658                 f255_add(c, x3, z3);
 659                 f255_sub(d, x3, z3);
 660                 f255_mul(da, d, a);
 661                 f255_mul(cb, c, b);
 662
 663                 /* obsolete
 664                 print_int("a ", a);
 665                 print_int("aa", aa);
 666                 print_int("b ", b);
 667                 print_int("bb", bb);
 668                 print_int("e ", e);
 669                 print_int("c ", c);
 670                 print_int("d ", d);
 671                 print_int("da", da);
 672                 print_int("cb", cb);
 673                 */
 674
 675                 f255_add(x3, da, cb);
 676                 f255_square(x3, x3);
 677                 f255_sub(z3, da, cb);
 678                 f255_square(z3, z3);
 679                 f255_mul(z3, z3, x1);
 680                 f255_mul(x2, aa, bb);
 681                 f255_mul_a24(z2, e);
 682                 f255_add(z2, z2, aa);
 683                 f255_mul(z2, e, z2);
 684
 685                 /* obsolete
 686                 print_int("x2", x2);
 687                 print_int("z2", z2);
 688                 print_int("x3", x3);
 689                 print_int("z3", z3);
 690                 */
 691         }
 692         cswap(x2, x3, swap);
 693         cswap(z2, z3, swap);
 694
 695         /*
 696          * Inverse z2 with a modular exponentiation. This is a simple
 697          * square-and-multiply algorithm; we mutualise most non-squarings
 698          * since the exponent contains almost only ones.
 699          */
 700         memcpy(a, z2, sizeof z2);
 701         for (i = 0; i < 15; i ++) {
 702                 f255_square(a, a);
 703                 f255_mul(a, a, z2);
 704         }
 705         memcpy(b, a, sizeof a);
 706         for (i = 0; i < 14; i ++) {
 707                 int j;
 708
 709                 for (j = 0; j < 16; j ++) {
 710                         f255_square(b, b);
 711                 }
 712                 f255_mul(b, b, a);
 713         }
 714         for (i = 14; i >= 0; i --) {
 715                 f255_square(b, b);
 716                 if ((0xFFEB >> i) & 1) {
 717                         f255_mul(b, z2, b);
 718                 }
 719         }
 720         f255_mul(x2, x2, b);
 721         reduce_final_f255(x2);
 722         le30_to_le8(G, 32, x2);
 723         return 1;
 724 }
 725
 726 static size_t
 727 api_mulgen(unsigned char *R,
 728         const unsigned char *x, size_t xlen, int curve)
 729 {
 730         const unsigned char *G;
 731         size_t Glen;
 732
 733         G = api_generator(curve, &Glen);
 734         memcpy(R, G, Glen);
 735         api_mul(R, Glen, x, xlen, curve);
 736         return Glen;
 737 }
 738
 739 static uint32_t
 740 api_muladd(unsigned char *A, const unsigned char *B, size_t len,
 741         const unsigned char *x, size_t xlen,
 742         const unsigned char *y, size_t ylen, int curve)
 743 {
 744         /*
 745          * We don't implement this method, since it is used for ECDSA
 746          * only, and there is no ECDSA over Curve25519 (which instead
 747          * uses EdDSA).
 748          */
 749         (void)A;
 750         (void)B;
 751         (void)len;
 752         (void)x;
 753         (void)xlen;
 754         (void)y;
 755         (void)ylen;
 756         (void)curve;
 757         return 0;
 758 }
 759
 760 /* see bearssl_ec.h */
 761 const br_ec_impl br_ec_c25519_m31 = {
 762         (uint32_t)0x20000000,
 763         &api_generator,
 764         &api_order,
 765         &api_xoff,
 766         &api_mul,
 767         &api_mulgen,
 768         &api_muladd
 769 };