/*
 * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "inner.h"

#if BR_INT128 || BR_UMUL128

#if BR_UMUL128
#include <intrin.h>
#endif

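/*
 * The generator is the conventional Curve25519 base point u = 9, in
 * the little-endian encoding of RFC 7748.
 */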
static const unsigned char GEN[] = {
	0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};

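/*
 * This is 2^255-1, not the actual subgroup order; since api_mul()
 * clamps multipliers as per RFC 7748, any 32-byte value is an
 * acceptable multiplier, and this constant merely provides the size
 * and an upper bound for randomly generated multipliers.
 */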
static const unsigned char ORDER[] = {
	0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
};

static const unsigned char *
api_generator(int curve, size_t *len)
{
	(void)curve;
	*len = 32;
	return GEN;
}

static const unsigned char *
api_order(int curve, size_t *len)
{
	(void)curve;
	*len = 32;
	return ORDER;
}

static size_t
api_xoff(int curve, size_t *len)
{
	(void)curve;
	*len = 32;
	return 0;
}

/*
 * A field element is encoded as four 64-bit integers, in basis 2^64.
 * Operations return partially reduced values, which may range up to
 * 2^255+37.
 */
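/*
 * Equivalently, an element encoded in d[0..3] has value:
 *
 *   d[0] + d[1]*2^64 + d[2]*2^128 + d[3]*2^192
 *
 * interpreted modulo p = 2^255-19.
 */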

#define MASK63 (((uint64_t)1 << 63) - (uint64_t)1)

/*
 * Swap two field elements, conditionally on a flag. This is
 * constant-time: the flag is expanded into a full-width mask, so
 * no branch or memory access depends on its value.
 */
static inline void
f255_cswap(uint64_t *a, uint64_t *b, uint32_t ctl)
{
	uint64_t m, w;

	m = -(uint64_t)ctl;
	w = m & (a[0] ^ b[0]); a[0] ^= w; b[0] ^= w;
	w = m & (a[1] ^ b[1]); a[1] ^= w; b[1] ^= w;
	w = m & (a[2] ^ b[2]); a[2] ^= w; b[2] ^= w;
	w = m & (a[3] ^ b[3]); a[3] ^= w; b[3] ^= w;
}

/*
 * Addition in the field.
 */
static inline void
f255_add(uint64_t *d, const uint64_t *a, const uint64_t *b)
{
#if BR_INT128

	uint64_t t0, t1, t2, t3, cc;
	unsigned __int128 z;

	z = (unsigned __int128)a[0] + (unsigned __int128)b[0];
	t0 = (uint64_t)z;
	z = (unsigned __int128)a[1] + (unsigned __int128)b[1] + (z >> 64);
	t1 = (uint64_t)z;
	z = (unsigned __int128)a[2] + (unsigned __int128)b[2] + (z >> 64);
	t2 = (uint64_t)z;
	z = (unsigned __int128)a[3] + (unsigned __int128)b[3] + (z >> 64);
	t3 = (uint64_t)z & MASK63;
	cc = (uint64_t)(z >> 63);

	/*
	 * Since operands are at most 2^255+37, the sum is at most
	 * 2^256+74; thus, the carry cc is equal to 0, 1 or 2.
	 *
	 * We use: 2^255 = 19 mod p.
	 * Since we add 0, 19 or 38 to a value that fits on 255 bits,
	 * the result is at most 2^255+37.
	 */
	z = (unsigned __int128)t0 + (unsigned __int128)(19 * cc);
	d[0] = (uint64_t)z;
	z = (unsigned __int128)t1 + (z >> 64);
	d[1] = (uint64_t)z;
	z = (unsigned __int128)t2 + (z >> 64);
	d[2] = (uint64_t)z;
	d[3] = t3 + (uint64_t)(z >> 64);

#elif BR_UMUL128

	uint64_t t0, t1, t2, t3, cc;
	unsigned char k;

	k = _addcarry_u64(0, a[0], b[0], &t0);
	k = _addcarry_u64(k, a[1], b[1], &t1);
	k = _addcarry_u64(k, a[2], b[2], &t2);
	k = _addcarry_u64(k, a[3], b[3], &t3);
	cc = (k << 1) + (t3 >> 63);
	t3 &= MASK63;

	/*
	 * Since operands are at most 2^255+37, the sum is at most
	 * 2^256+74; thus, the carry cc is equal to 0, 1 or 2.
	 *
	 * We use: 2^255 = 19 mod p.
	 * Since we add 0, 19 or 38 to a value that fits on 255 bits,
	 * the result is at most 2^255+37.
	 */
	k = _addcarry_u64(0, t0, 19 * cc, &d[0]);
	k = _addcarry_u64(k, t1, 0, &d[1]);
	k = _addcarry_u64(k, t2, 0, &d[2]);
	(void)_addcarry_u64(k, t3, 0, &d[3]);

#endif
}

/*
 * Subtraction.
 * Operands may each be up to 2^255+37; the result is partially
 * reduced, with max value 2^255+37 as well.
 */
static inline void
f255_sub(uint64_t *d, const uint64_t *a, const uint64_t *b)
{
#if BR_INT128

	/*
	 * We compute t = 2^256 - 38 + a - b; since 2^256 - 38 = 2*p,
	 * this changes nothing modulo p. The value t is necessarily
	 * positive but lower than 2^256 + 2^255, since a <= 2^255 + 37
	 * and b <= 2^255 + 37. We then subtract 0, p or 2*p, depending
	 * on the two upper bits of t (bits 255 and 256).
	 */

	uint64_t t0, t1, t2, t3, t4, cc;
	unsigned __int128 z;

	z = (unsigned __int128)a[0] - (unsigned __int128)b[0] - 38;
	t0 = (uint64_t)z;
	cc = -(uint64_t)(z >> 64);
	z = (unsigned __int128)a[1] - (unsigned __int128)b[1]
		- (unsigned __int128)cc;
	t1 = (uint64_t)z;
	cc = -(uint64_t)(z >> 64);
	z = (unsigned __int128)a[2] - (unsigned __int128)b[2]
		- (unsigned __int128)cc;
	t2 = (uint64_t)z;
	cc = -(uint64_t)(z >> 64);
	z = (unsigned __int128)a[3] - (unsigned __int128)b[3]
		- (unsigned __int128)cc;
	t3 = (uint64_t)z;
	t4 = 1 + (uint64_t)(z >> 64);

	/*
	 * We have a 257-bit result. The two top bits can be 00, 01 or 10,
	 * but not 11 (value t <= 2^256 - 38 + 2^255 + 37 = 2^256 + 2^255 - 1).
	 * Therefore, we can truncate to 255 bits, and add 0, 19 or 38.
	 * This guarantees that the result is at most 2^255+37.
	 */
	cc = (38 & -t4) + (19 & -(t3 >> 63));
	t3 &= MASK63;
	z = (unsigned __int128)t0 + (unsigned __int128)cc;
	d[0] = (uint64_t)z;
	z = (unsigned __int128)t1 + (z >> 64);
	d[1] = (uint64_t)z;
	z = (unsigned __int128)t2 + (z >> 64);
	d[2] = (uint64_t)z;
	d[3] = t3 + (uint64_t)(z >> 64);

#elif BR_UMUL128

	/*
	 * We compute t = 2^256 - 38 + a - b; since 2^256 - 38 = 2*p,
	 * this changes nothing modulo p. The value t is necessarily
	 * positive but lower than 2^256 + 2^255, since a <= 2^255 + 37
	 * and b <= 2^255 + 37. We then subtract 0, p or 2*p, depending
	 * on the two upper bits of t (bits 255 and 256).
	 */

	uint64_t t0, t1, t2, t3, t4;
	unsigned char k;

	k = _subborrow_u64(0, a[0], b[0], &t0);
	k = _subborrow_u64(k, a[1], b[1], &t1);
	k = _subborrow_u64(k, a[2], b[2], &t2);
	k = _subborrow_u64(k, a[3], b[3], &t3);
	(void)_subborrow_u64(k, 1, 0, &t4);

	k = _subborrow_u64(0, t0, 38, &t0);
	k = _subborrow_u64(k, t1, 0, &t1);
	k = _subborrow_u64(k, t2, 0, &t2);
	k = _subborrow_u64(k, t3, 0, &t3);
	(void)_subborrow_u64(k, t4, 0, &t4);

	/*
	 * We have a 257-bit result. The two top bits can be 00, 01 or 10,
	 * but not 11 (value t <= 2^256 - 38 + 2^255 + 37 = 2^256 + 2^255 - 1).
	 * Therefore, we can truncate to 255 bits, and add 0, 19 or 38.
	 * This guarantees that the result is at most 2^255+37.
	 */
	t4 = (38 & -t4) + (19 & -(t3 >> 63));
	t3 &= MASK63;
	k = _addcarry_u64(0, t0, t4, &d[0]);
	k = _addcarry_u64(k, t1, 0, &d[1]);
	k = _addcarry_u64(k, t2, 0, &d[2]);
	(void)_addcarry_u64(k, t3, 0, &d[3]);

#endif
}

/*
 * Multiplication.
 */
static inline void
f255_mul(uint64_t *d, uint64_t *a, uint64_t *b)
{
#if BR_INT128

	unsigned __int128 z;
	uint64_t t0, t1, t2, t3, t4, t5, t6, t7, th;

	/*
	 * Compute the product a*b over plain integers.
	 */
	z = (unsigned __int128)a[0] * (unsigned __int128)b[0];
	t0 = (uint64_t)z;
	z = (unsigned __int128)a[0] * (unsigned __int128)b[1] + (z >> 64);
	t1 = (uint64_t)z;
	z = (unsigned __int128)a[0] * (unsigned __int128)b[2] + (z >> 64);
	t2 = (uint64_t)z;
	z = (unsigned __int128)a[0] * (unsigned __int128)b[3] + (z >> 64);
	t3 = (uint64_t)z;
	t4 = (uint64_t)(z >> 64);

	z = (unsigned __int128)a[1] * (unsigned __int128)b[0]
		+ (unsigned __int128)t1;
	t1 = (uint64_t)z;
	z = (unsigned __int128)a[1] * (unsigned __int128)b[1]
		+ (unsigned __int128)t2 + (z >> 64);
	t2 = (uint64_t)z;
	z = (unsigned __int128)a[1] * (unsigned __int128)b[2]
		+ (unsigned __int128)t3 + (z >> 64);
	t3 = (uint64_t)z;
	z = (unsigned __int128)a[1] * (unsigned __int128)b[3]
		+ (unsigned __int128)t4 + (z >> 64);
	t4 = (uint64_t)z;
	t5 = (uint64_t)(z >> 64);

	z = (unsigned __int128)a[2] * (unsigned __int128)b[0]
		+ (unsigned __int128)t2;
	t2 = (uint64_t)z;
	z = (unsigned __int128)a[2] * (unsigned __int128)b[1]
		+ (unsigned __int128)t3 + (z >> 64);
	t3 = (uint64_t)z;
	z = (unsigned __int128)a[2] * (unsigned __int128)b[2]
		+ (unsigned __int128)t4 + (z >> 64);
	t4 = (uint64_t)z;
	z = (unsigned __int128)a[2] * (unsigned __int128)b[3]
		+ (unsigned __int128)t5 + (z >> 64);
	t5 = (uint64_t)z;
	t6 = (uint64_t)(z >> 64);

	z = (unsigned __int128)a[3] * (unsigned __int128)b[0]
		+ (unsigned __int128)t3;
	t3 = (uint64_t)z;
	z = (unsigned __int128)a[3] * (unsigned __int128)b[1]
		+ (unsigned __int128)t4 + (z >> 64);
	t4 = (uint64_t)z;
	z = (unsigned __int128)a[3] * (unsigned __int128)b[2]
		+ (unsigned __int128)t5 + (z >> 64);
	t5 = (uint64_t)z;
	z = (unsigned __int128)a[3] * (unsigned __int128)b[3]
		+ (unsigned __int128)t6 + (z >> 64);
	t6 = (uint64_t)z;
	t7 = (uint64_t)(z >> 64);

	/*
	 * Modulo p, we have:
	 *
	 *   2^255 = 19
	 *   2^510 = 19*19 = 361
	 *
	 * We split the intermediate t into three parts, in basis
	 * 2^255. The low one will be in t0..t3; the middle one in t4..t7.
	 * The upper one can only be a single bit (th), since the
	 * multiplication operands are at most 2^255+37 each.
	 */
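	/*
	 * Writing t = tl + 2^255*tm + 2^510*th, with tl and tm each
	 * over 255 bits and th the single top bit, this means:
	 *
	 *   t = tl + 19*tm + 361*th mod p
	 */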
	th = t7 >> 62;
	t7 = ((t7 << 1) | (t6 >> 63)) & MASK63;
	t6 = (t6 << 1) | (t5 >> 63);
	t5 = (t5 << 1) | (t4 >> 63);
	t4 = (t4 << 1) | (t3 >> 63);
	t3 &= MASK63;

	/*
	 * Multiply the middle part (t4..t7) by 19. We truncate it to
	 * 255 bits; the extra bits will go along with th.
	 */
	z = (unsigned __int128)t4 * 19;
	t4 = (uint64_t)z;
	z = (unsigned __int128)t5 * 19 + (z >> 64);
	t5 = (uint64_t)z;
	z = (unsigned __int128)t6 * 19 + (z >> 64);
	t6 = (uint64_t)z;
	z = (unsigned __int128)t7 * 19 + (z >> 64);
	t7 = (uint64_t)z & MASK63;

	th = (361 & -th) + (19 * (uint64_t)(z >> 63));

	/*
	 * Add elements together.
	 * At this point:
	 *   t0..t3 fits on 255 bits.
	 *   t4..t7 fits on 255 bits.
	 *   th <= 361 + 342 = 703.
	 */
	z = (unsigned __int128)t0 + (unsigned __int128)t4
		+ (unsigned __int128)th;
	t0 = (uint64_t)z;
	z = (unsigned __int128)t1 + (unsigned __int128)t5 + (z >> 64);
	t1 = (uint64_t)z;
	z = (unsigned __int128)t2 + (unsigned __int128)t6 + (z >> 64);
	t2 = (uint64_t)z;
	z = (unsigned __int128)t3 + (unsigned __int128)t7 + (z >> 64);
	t3 = (uint64_t)z & MASK63;
	th = (uint64_t)(z >> 63);

	/*
	 * Since the sum is at most 2^256 + 703, the two upper bits, in th,
	 * can only have value 0, 1 or 2. We just add th*19, which
	 * guarantees a result of at most 2^255+37.
	 */
	z = (unsigned __int128)t0 + (19 * th);
	d[0] = (uint64_t)z;
	z = (unsigned __int128)t1 + (z >> 64);
	d[1] = (uint64_t)z;
	z = (unsigned __int128)t2 + (z >> 64);
	d[2] = (uint64_t)z;
	d[3] = t3 + (uint64_t)(z >> 64);

#elif BR_UMUL128

	uint64_t t0, t1, t2, t3, t4, t5, t6, t7, th;
	uint64_t h0, h1, h2, h3;
	unsigned char k;

	/*
	 * Compute the product a*b over plain integers.
	 */
	t0 = _umul128(a[0], b[0], &h0);
	t1 = _umul128(a[0], b[1], &h1);
	k = _addcarry_u64(0, t1, h0, &t1);
	t2 = _umul128(a[0], b[2], &h2);
	k = _addcarry_u64(k, t2, h1, &t2);
	t3 = _umul128(a[0], b[3], &h3);
	k = _addcarry_u64(k, t3, h2, &t3);
	(void)_addcarry_u64(k, h3, 0, &t4);

	k = _addcarry_u64(0, _umul128(a[1], b[0], &h0), t1, &t1);
	k = _addcarry_u64(k, _umul128(a[1], b[1], &h1), t2, &t2);
	k = _addcarry_u64(k, _umul128(a[1], b[2], &h2), t3, &t3);
	k = _addcarry_u64(k, _umul128(a[1], b[3], &h3), t4, &t4);
	t5 = k;
	k = _addcarry_u64(0, t2, h0, &t2);
	k = _addcarry_u64(k, t3, h1, &t3);
	k = _addcarry_u64(k, t4, h2, &t4);
	(void)_addcarry_u64(k, t5, h3, &t5);

	k = _addcarry_u64(0, _umul128(a[2], b[0], &h0), t2, &t2);
	k = _addcarry_u64(k, _umul128(a[2], b[1], &h1), t3, &t3);
	k = _addcarry_u64(k, _umul128(a[2], b[2], &h2), t4, &t4);
	k = _addcarry_u64(k, _umul128(a[2], b[3], &h3), t5, &t5);
	t6 = k;
	k = _addcarry_u64(0, t3, h0, &t3);
	k = _addcarry_u64(k, t4, h1, &t4);
	k = _addcarry_u64(k, t5, h2, &t5);
	(void)_addcarry_u64(k, t6, h3, &t6);

	k = _addcarry_u64(0, _umul128(a[3], b[0], &h0), t3, &t3);
	k = _addcarry_u64(k, _umul128(a[3], b[1], &h1), t4, &t4);
	k = _addcarry_u64(k, _umul128(a[3], b[2], &h2), t5, &t5);
	k = _addcarry_u64(k, _umul128(a[3], b[3], &h3), t6, &t6);
	t7 = k;
	k = _addcarry_u64(0, t4, h0, &t4);
	k = _addcarry_u64(k, t5, h1, &t5);
	k = _addcarry_u64(k, t6, h2, &t6);
	(void)_addcarry_u64(k, t7, h3, &t7);

	/*
	 * Modulo p, we have:
	 *
	 *   2^255 = 19
	 *   2^510 = 19*19 = 361
	 *
	 * We split the intermediate t into three parts, in basis
	 * 2^255. The low one will be in t0..t3; the middle one in t4..t7.
	 * The upper one can only be a single bit (th), since the
	 * multiplication operands are at most 2^255+37 each.
	 */
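	/*
	 * Writing t = tl + 2^255*tm + 2^510*th, with tl and tm each
	 * over 255 bits and th the single top bit, this means:
	 *
	 *   t = tl + 19*tm + 361*th mod p
	 */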
	th = t7 >> 62;
	t7 = ((t7 << 1) | (t6 >> 63)) & MASK63;
	t6 = (t6 << 1) | (t5 >> 63);
	t5 = (t5 << 1) | (t4 >> 63);
	t4 = (t4 << 1) | (t3 >> 63);
	t3 &= MASK63;

	/*
	 * Multiply the middle part (t4..t7) by 19. We truncate it to
	 * 255 bits; the extra bits will go along with th.
	 */
	t4 = _umul128(t4, 19, &h0);
	t5 = _umul128(t5, 19, &h1);
	t6 = _umul128(t6, 19, &h2);
	t7 = _umul128(t7, 19, &h3);
	k = _addcarry_u64(0, t5, h0, &t5);
	k = _addcarry_u64(k, t6, h1, &t6);
	k = _addcarry_u64(k, t7, h2, &t7);
	(void)_addcarry_u64(k, h3, 0, &h3);
	th = (361 & -th) + (19 * ((h3 << 1) + (t7 >> 63)));
	t7 &= MASK63;

	/*
	 * Add elements together.
	 * At this point:
	 *   t0..t3 fits on 255 bits.
	 *   t4..t7 fits on 255 bits.
	 *   th <= 361 + 342 = 703.
	 */
	k = _addcarry_u64(0, t0, t4, &t0);
	k = _addcarry_u64(k, t1, t5, &t1);
	k = _addcarry_u64(k, t2, t6, &t2);
	k = _addcarry_u64(k, t3, t7, &t3);
	t4 = k;
	k = _addcarry_u64(0, t0, th, &t0);
	k = _addcarry_u64(k, t1, 0, &t1);
	k = _addcarry_u64(k, t2, 0, &t2);
	k = _addcarry_u64(k, t3, 0, &t3);
	(void)_addcarry_u64(k, t4, 0, &t4);

	th = (t4 << 1) + (t3 >> 63);
	t3 &= MASK63;

	/*
	 * Since the sum is at most 2^256 + 703, the two upper bits, in th,
	 * can only have value 0, 1 or 2. We just add th*19, which
	 * guarantees a result of at most 2^255+37.
	 */
	k = _addcarry_u64(0, t0, 19 * th, &d[0]);
	k = _addcarry_u64(k, t1, 0, &d[1]);
	k = _addcarry_u64(k, t2, 0, &d[2]);
	(void)_addcarry_u64(k, t3, 0, &d[3]);

#endif
}

/*
 * Multiplication by A24 = 121665. This is the constant (A-2)/4 from
 * RFC 7748, with A = 486662 being the Montgomery coefficient of
 * Curve25519; it appears in the z_2 computation of the ladder step.
 */
static inline void
f255_mul_a24(uint64_t *d, const uint64_t *a)
{
#if BR_INT128

	uint64_t t0, t1, t2, t3;
	unsigned __int128 z;

	z = (unsigned __int128)a[0] * 121665;
	t0 = (uint64_t)z;
	z = (unsigned __int128)a[1] * 121665 + (z >> 64);
	t1 = (uint64_t)z;
	z = (unsigned __int128)a[2] * 121665 + (z >> 64);
	t2 = (uint64_t)z;
	z = (unsigned __int128)a[3] * 121665 + (z >> 64);
	t3 = (uint64_t)z & MASK63;

	z = (unsigned __int128)t0 + (19 * (uint64_t)(z >> 63));
	t0 = (uint64_t)z;
	z = (unsigned __int128)t1 + (z >> 64);
	t1 = (uint64_t)z;
	z = (unsigned __int128)t2 + (z >> 64);
	t2 = (uint64_t)z;
	t3 = t3 + (uint64_t)(z >> 64);

	z = (unsigned __int128)t0 + (19 & -(t3 >> 63));
	d[0] = (uint64_t)z;
	z = (unsigned __int128)t1 + (z >> 64);
	d[1] = (uint64_t)z;
	z = (unsigned __int128)t2 + (z >> 64);
	d[2] = (uint64_t)z;
	d[3] = (t3 & MASK63) + (uint64_t)(z >> 64);

#elif BR_UMUL128

	uint64_t t0, t1, t2, t3, t4, h0, h1, h2, h3;
	unsigned char k;

	t0 = _umul128(a[0], 121665, &h0);
	t1 = _umul128(a[1], 121665, &h1);
	k = _addcarry_u64(0, t1, h0, &t1);
	t2 = _umul128(a[2], 121665, &h2);
	k = _addcarry_u64(k, t2, h1, &t2);
	t3 = _umul128(a[3], 121665, &h3);
	k = _addcarry_u64(k, t3, h2, &t3);
	(void)_addcarry_u64(k, h3, 0, &t4);

	t4 = (t4 << 1) + (t3 >> 63);
	t3 &= MASK63;
	k = _addcarry_u64(0, t0, 19 * t4, &t0);
	k = _addcarry_u64(k, t1, 0, &t1);
	k = _addcarry_u64(k, t2, 0, &t2);
	(void)_addcarry_u64(k, t3, 0, &t3);

	t4 = 19 & -(t3 >> 63);
	t3 &= MASK63;
	k = _addcarry_u64(0, t0, t4, &d[0]);
	k = _addcarry_u64(k, t1, 0, &d[1]);
	k = _addcarry_u64(k, t2, 0, &d[2]);
	(void)_addcarry_u64(k, t3, 0, &d[3]);

#endif
}

/*
 * Finalize reduction: bring a partially reduced value (at most
 * 2^255+37, hence lower than 2*p) down to the canonical range
 * [0, p-1], in constant time.
 */
static inline void
f255_final_reduce(uint64_t *a)
{
#if BR_INT128

	uint64_t t0, t1, t2, t3, m;
	unsigned __int128 z;

	/*
	 * We add 19. If the result (in t) is below 2^255, then a[]
	 * is already less than 2^255-19, thus already reduced.
	 * Otherwise, we subtract 2^255 from t[], in which case we
	 * have t = a - (2^255-19), and that's our result.
	 */
	z = (unsigned __int128)a[0] + 19;
	t0 = (uint64_t)z;
	z = (unsigned __int128)a[1] + (z >> 64);
	t1 = (uint64_t)z;
	z = (unsigned __int128)a[2] + (z >> 64);
	t2 = (uint64_t)z;
	t3 = a[3] + (uint64_t)(z >> 64);

	m = -(t3 >> 63);
	t3 &= MASK63;
	a[0] ^= m & (a[0] ^ t0);
	a[1] ^= m & (a[1] ^ t1);
	a[2] ^= m & (a[2] ^ t2);
	a[3] ^= m & (a[3] ^ t3);

#elif BR_UMUL128

	uint64_t t0, t1, t2, t3, m;
	unsigned char k;

	/*
	 * We add 19. If the result (in t) is below 2^255, then a[]
	 * is already less than 2^255-19, thus already reduced.
	 * Otherwise, we subtract 2^255 from t[], in which case we
	 * have t = a - (2^255-19), and that's our result.
	 */
	k = _addcarry_u64(0, a[0], 19, &t0);
	k = _addcarry_u64(k, a[1], 0, &t1);
	k = _addcarry_u64(k, a[2], 0, &t2);
	(void)_addcarry_u64(k, a[3], 0, &t3);

	m = -(t3 >> 63);
	t3 &= MASK63;
	a[0] ^= m & (a[0] ^ t0);
	a[1] ^= m & (a[1] ^ t1);
	a[2] ^= m & (a[2] ^ t2);
	a[3] ^= m & (a[3] ^ t3);

#endif
}

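/*
 * Point multiplication, using the Montgomery ladder of RFC 7748
 * (section 5); the per-step comments in the loop below follow the
 * variable names of the RFC pseudocode (A, AA, B, BB, E, C, D,
 * DA, CB).
 */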
static uint32_t
api_mul(unsigned char *G, size_t Glen,
	const unsigned char *kb, size_t kblen, int curve)
{
	unsigned char k[32];
	uint64_t x1[4], x2[4], z2[4], x3[4], z3[4];
	uint32_t swap;
	int i;

	(void)curve;

	/*
	 * Points are encoded over exactly 32 bytes. Multipliers must fit
	 * in 32 bytes as well.
	 */
	if (Glen != 32 || kblen > 32) {
		return 0;
	}

	/*
	 * RFC 7748 mandates that the high bit of the last point byte must
	 * be ignored/cleared.
	 */
	x1[0] = br_dec64le(&G[ 0]);
	x1[1] = br_dec64le(&G[ 8]);
	x1[2] = br_dec64le(&G[16]);
	x1[3] = br_dec64le(&G[24]) & MASK63;

	/*
	 * We can use memset() to clear values, because exact-width types
	 * like uint64_t are guaranteed to have no padding bits or
	 * trap representations.
	 */
	memset(x2, 0, sizeof x2);
	x2[0] = 1;
	memset(z2, 0, sizeof z2);
	memcpy(x3, x1, sizeof x1);
	memcpy(z3, x2, sizeof x2);

	/*
	 * The multiplier is provided in big-endian notation, and
	 * possibly shorter than 32 bytes; it is then "clamped" as
	 * specified by RFC 7748: the three low bits are cleared (so
	 * that the multiplier is a multiple of the cofactor 8), the
	 * top bit (bit 255) is cleared, and bit 254 is set. Since k[]
	 * is big-endian here, the low bits live in k[31] and the high
	 * bits in k[0].
	 */
	memset(k, 0, (sizeof k) - kblen);
	memcpy(k + (sizeof k) - kblen, kb, kblen);
	k[31] &= 0xF8;
	k[0] &= 0x7F;
	k[0] |= 0x40;

	swap = 0;

	for (i = 254; i >= 0; i --) {
		uint64_t a[4], aa[4], b[4], bb[4], e[4];
		uint64_t c[4], d[4], da[4], cb[4];
		uint32_t kt;

		kt = (k[31 - (i >> 3)] >> (i & 7)) & 1;
		swap ^= kt;
		f255_cswap(x2, x3, swap);
		f255_cswap(z2, z3, swap);
		swap = kt;

		/* A = x_2 + z_2 */
		f255_add(a, x2, z2);

		/* AA = A^2 */
		f255_mul(aa, a, a);

		/* B = x_2 - z_2 */
		f255_sub(b, x2, z2);

		/* BB = B^2 */
		f255_mul(bb, b, b);

		/* E = AA - BB */
		f255_sub(e, aa, bb);

		/* C = x_3 + z_3 */
		f255_add(c, x3, z3);

		/* D = x_3 - z_3 */
		f255_sub(d, x3, z3);

		/* DA = D * A */
		f255_mul(da, d, a);

		/* CB = C * B */
		f255_mul(cb, c, b);

		/* x_3 = (DA + CB)^2 */
		f255_add(x3, da, cb);
		f255_mul(x3, x3, x3);

		/* z_3 = x_1 * (DA - CB)^2 */
		f255_sub(z3, da, cb);
		f255_mul(z3, z3, z3);
		f255_mul(z3, x1, z3);

		/* x_2 = AA * BB */
		f255_mul(x2, aa, bb);

		/* z_2 = E * (AA + a24 * E) */
		f255_mul_a24(z2, e);
		f255_add(z2, aa, z2);
		f255_mul(z2, e, z2);
	}

	f255_cswap(x2, x3, swap);
	f255_cswap(z2, z3, swap);

	/*
	 * Compute 1/z2 = z2^(p-2). Since p = 2^255-19, the exponent is
	 * p-2 = 2^255-21, whose bit pattern lets us mutualize most
	 * non-squarings: we first compute z2^(2^16-1), extend it into
	 * z2^(2^240-1), then fold in the remaining low bits (0x7FEB)
	 * over 15 final squarings. We use x1 and x3, now useless, as
	 * temporaries.
	 */
	memcpy(x1, z2, sizeof z2);
	for (i = 0; i < 15; i ++) {
		f255_mul(x1, x1, x1);
		f255_mul(x1, x1, z2);
	}
	memcpy(x3, x1, sizeof x1);
	for (i = 0; i < 14; i ++) {
		int j;

		for (j = 0; j < 16; j ++) {
			f255_mul(x3, x3, x3);
		}
		f255_mul(x3, x3, x1);
	}
	for (i = 14; i >= 0; i --) {
		f255_mul(x3, x3, x3);
		if ((0xFFEB >> i) & 1) {
			f255_mul(x3, z2, x3);
		}
	}

	/*
	 * Compute x2/z2. We have 1/z2 in x3.
	 */
	f255_mul(x2, x2, x3);
	f255_final_reduce(x2);

	/*
	 * Encode the final x2 value in little-endian.
	 */
	br_enc64le(G, x2[0]);
	br_enc64le(G + 8, x2[1]);
	br_enc64le(G + 16, x2[2]);
	br_enc64le(G + 24, x2[3]);
	return 1;
}

static size_t
api_mulgen(unsigned char *R,
	const unsigned char *x, size_t xlen, int curve)
{
	const unsigned char *G;
	size_t Glen;

	G = api_generator(curve, &Glen);
	memcpy(R, G, Glen);
	api_mul(R, Glen, x, xlen, curve);
	return Glen;
}

static uint32_t
api_muladd(unsigned char *A, const unsigned char *B, size_t len,
	const unsigned char *x, size_t xlen,
	const unsigned char *y, size_t ylen, int curve)
{
	/*
	 * We don't implement this method, since it is used for ECDSA
	 * only, and there is no ECDSA over Curve25519 (which instead
	 * uses EdDSA).
	 */
	(void)A;
	(void)B;
	(void)len;
	(void)x;
	(void)xlen;
	(void)y;
	(void)ylen;
	(void)curve;
	return 0;
}

/* see bearssl_ec.h */
const br_ec_impl br_ec_c25519_m64 = {
	(uint32_t)0x20000000,
	&api_generator,
	&api_order,
	&api_xoff,
	&api_mul,
	&api_mulgen,
	&api_muladd
};

/* see bearssl_ec.h */
const br_ec_impl *
br_ec_c25519_m64_get(void)
{
	return &br_ec_c25519_m64;
}
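
/*
 * Usage sketch (illustrative; the helper name is hypothetical and
 * error handling is elided): an X25519-style shared secret computed
 * through the br_ec_impl interface, given a 32-byte local secret and
 * the peer's 32-byte public point. Note that api_mul() applies the
 * RFC 7748 clamping to the provided secret itself.
 */
static inline uint32_t
example_x25519_shared(unsigned char *shared,
	const unsigned char *peer_pub, const unsigned char *secret)
{
	const br_ec_impl *ec;

	/*
	 * Obtain this implementation (non-NULL in this branch, since
	 * BR_INT128 or BR_UMUL128 is set).
	 */
	ec = br_ec_c25519_m64_get();

	/* The point is multiplied in place. */
	memcpy(shared, peer_pub, 32);
	return ec->mul(shared, 32, secret, 32, BR_EC_curve25519);
}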

#else

/* see bearssl_ec.h */
const br_ec_impl *
br_ec_c25519_m64_get(void)
{
	return 0;
}

#endif