_ Git - BearSSL/blob - src/ec/ec_c25519_m64.c

   1 /*
   2  * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining
   5  * a copy of this software and associated documentation files (the
   6  * "Software"), to deal in the Software without restriction, including
   7  * without limitation the rights to use, copy, modify, merge, publish,
   8  * distribute, sublicense, and/or sell copies of the Software, and to
   9  * permit persons to whom the Software is furnished to do so, subject to
  10  * the following conditions:
  11  *
  12  * The above copyright notice and this permission notice shall be
  13  * included in all copies or substantial portions of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  16  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  17  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  18  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  19  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  20  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  21  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24
  25 #include "inner.h"
  26
  27 #if BR_INT128 || BR_UMUL128
  28
  29 #if BR_UMUL128
  30 #include <intrin.h>
  31 #endif
  32
  33 static const unsigned char GEN[] = {
  34         0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  35         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  36         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  37         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
  38 };
  39
  40 static const unsigned char ORDER[] = {
  41         0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
  42         0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
  43         0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
  44         0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
  45 };
  46
  47 static const unsigned char *
  48 api_generator(int curve, size_t *len)
  49 {
  50         (void)curve;
  51         *len = 32;
  52         return GEN;
  53 }
  54
  55 static const unsigned char *
  56 api_order(int curve, size_t *len)
  57 {
  58         (void)curve;
  59         *len = 32;
  60         return ORDER;
  61 }
  62
  63 static size_t
  64 api_xoff(int curve, size_t *len)
  65 {
  66         (void)curve;
  67         *len = 32;
  68         return 0;
  69 }
  70
  71 /*
  72  * A field element is encoded as four 64-bit integers, in basis 2^63.
  73  * Operations return partially reduced values, which may range up to
  74  * 2^255+37.
  75  */
  76
  77 #define MASK63   (((uint64_t)1 << 63) - (uint64_t)1)
  78
  79 /*
  80  * Swap two field elements, conditionally on a flag.
  81  */
  82 static inline void
  83 f255_cswap(uint64_t *a, uint64_t *b, uint32_t ctl)
  84 {
  85         uint64_t m, w;
  86
  87         m = -(uint64_t)ctl;
  88         w = m & (a[0] ^ b[0]); a[0] ^= w; b[0] ^= w;
  89         w = m & (a[1] ^ b[1]); a[1] ^= w; b[1] ^= w;
  90         w = m & (a[2] ^ b[2]); a[2] ^= w; b[2] ^= w;
  91         w = m & (a[3] ^ b[3]); a[3] ^= w; b[3] ^= w;
  92 }
  93
  94 /*
  95  * Addition in the field.
  96  */
  97 static inline void
  98 f255_add(uint64_t *d, const uint64_t *a, const uint64_t *b)
  99 {
 100 #if BR_INT128
 101
 102         uint64_t t0, t1, t2, t3, cc;
 103         unsigned __int128 z;
 104
 105         z = (unsigned __int128)a[0] + (unsigned __int128)b[0];
 106         t0 = (uint64_t)z;
 107         z = (unsigned __int128)a[1] + (unsigned __int128)b[1] + (z >> 64);
 108         t1 = (uint64_t)z;
 109         z = (unsigned __int128)a[2] + (unsigned __int128)b[2] + (z >> 64);
 110         t2 = (uint64_t)z;
 111         z = (unsigned __int128)a[3] + (unsigned __int128)b[3] + (z >> 64);
 112         t3 = (uint64_t)z & MASK63;
 113         cc = (uint64_t)(z >> 63);
 114
 115         /*
 116          * Since operands are at most 2^255+37, the sum is at most
 117          * 2^256+74; thus, the carry cc is equal to 0, 1 or 2.
 118          *
 119          * We use: 2^255 = 19 mod p.
 120          * Since we add 0, 19 or 38 to a value that fits on 255 bits,
 121          * the result is at most 2^255+37.
 122          */
 123         z = (unsigned __int128)t0 + (unsigned __int128)(19 * cc);
 124         d[0] = (uint64_t)z;
 125         z = (unsigned __int128)t1 + (z >> 64);
 126         d[1] = (uint64_t)z;
 127         z = (unsigned __int128)t2 + (z >> 64);
 128         d[2] = (uint64_t)z;
 129         d[3] = t3 + (uint64_t)(z >> 64);
 130
 131 #elif BR_UMUL128
 132
 133         uint64_t t0, t1, t2, t3, cc;
 134         unsigned char k;
 135
 136         k = _addcarry_u64(0, a[0], b[0], &t0);
 137         k = _addcarry_u64(k, a[1], b[1], &t1);
 138         k = _addcarry_u64(k, a[2], b[2], &t2);
 139         k = _addcarry_u64(k, a[3], b[3], &t3);
 140         cc = (k << 1) + (t3 >> 63);
 141         t3 &= MASK63;
 142
 143         /*
 144          * Since operands are at most 2^255+37, the sum is at most
 145          * 2^256+74; thus, the carry cc is equal to 0, 1 or 2.
 146          *
 147          * We use: 2^255 = 19 mod p.
 148          * Since we add 0, 19 or 38 to a value that fits on 255 bits,
 149          * the result is at most 2^255+37.
 150          */
 151         k = _addcarry_u64(0, t0, 19 * cc, &d[0]);
 152         k = _addcarry_u64(k, t1, 0, &d[1]);
 153         k = _addcarry_u64(k, t2, 0, &d[2]);
 154         (void)_addcarry_u64(k, t3, 0, &d[3]);
 155
 156 #endif
 157 }
 158
 159 /*
 160  * Subtraction.
 161  */
 162 static inline void
 163 f255_sub(uint64_t *d, const uint64_t *a, const uint64_t *b)
 164 {
 165 #if BR_INT128
 166
 167         /*
 168          * We compute t = 2^256 - 38 + a - b, which is necessarily
 169          * positive but lower than 2^256 + 2^255, since a <= 2^255 + 37
 170          * and b <= 2^255 + 37. We then subtract 0, p or 2*p, depending
 171          * on the two upper bits of t (bits 255 and 256).
 172          */
 173
 174         uint64_t t0, t1, t2, t3, t4, cc;
 175         unsigned __int128 z;
 176
 177         z = (unsigned __int128)a[0] - (unsigned __int128)b[0] - 38;
 178         t0 = (uint64_t)z;
 179         cc = -(uint64_t)(z >> 64);
 180         z = (unsigned __int128)a[1] - (unsigned __int128)b[1]
 181                 - (unsigned __int128)cc;
 182         t1 = (uint64_t)z;
 183         cc = -(uint64_t)(z >> 64);
 184         z = (unsigned __int128)a[2] - (unsigned __int128)b[2]
 185                 - (unsigned __int128)cc;
 186         t2 = (uint64_t)z;
 187         cc = -(uint64_t)(z >> 64);
 188         z = (unsigned __int128)a[3] - (unsigned __int128)b[3]
 189                 - (unsigned __int128)cc;
 190         t3 = (uint64_t)z;
 191         t4 = 1 + (uint64_t)(z >> 64);
 192
 193         /*
 194          * We have a 257-bit result. The two top bits can be 00, 01 or 10,
 195          * but not 11 (value t <= 2^256 - 38 + 2^255 + 37 = 2^256 + 2^255 - 1).
 196          * Therefore, we can truncate to 255 bits, and add 0, 19 or 38.
 197          * This guarantees that the result is at most 2^255+37.
 198          */
 199         cc = (38 & -t4) + (19 & -(t3 >> 63));
 200         t3 &= MASK63;
 201         z = (unsigned __int128)t0 + (unsigned __int128)cc;
 202         d[0] = (uint64_t)z;
 203         z = (unsigned __int128)t1 + (z >> 64);
 204         d[1] = (uint64_t)z;
 205         z = (unsigned __int128)t2 + (z >> 64);
 206         d[2] = (uint64_t)z;
 207         d[3] = t3 + (uint64_t)(z >> 64);
 208
 209 #elif BR_UMUL128
 210
 211         /*
 212          * We compute t = 2^256 - 38 + a - b, which is necessarily
 213          * positive but lower than 2^256 + 2^255, since a <= 2^255 + 37
 214          * and b <= 2^255 + 37. We then subtract 0, p or 2*p, depending
 215          * on the two upper bits of t (bits 255 and 256).
 216          */
 217
 218         uint64_t t0, t1, t2, t3, t4;
 219         unsigned char k;
 220
 221         k = _subborrow_u64(0, a[0], b[0], &t0);
 222         k = _subborrow_u64(k, a[1], b[1], &t1);
 223         k = _subborrow_u64(k, a[2], b[2], &t2);
 224         k = _subborrow_u64(k, a[3], b[3], &t3);
 225         (void)_subborrow_u64(k, 1, 0, &t4);
 226
 227         k = _subborrow_u64(0, t0, 38, &t0);
 228         k = _subborrow_u64(k, t1, 0, &t1);
 229         k = _subborrow_u64(k, t2, 0, &t2);
 230         k = _subborrow_u64(k, t3, 0, &t3);
 231         (void)_subborrow_u64(k, t4, 0, &t4);
 232
 233         /*
 234          * We have a 257-bit result. The two top bits can be 00, 01 or 10,
 235          * but not 11 (value t <= 2^256 - 38 + 2^255 + 37 = 2^256 + 2^255 - 1).
 236          * Therefore, we can truncate to 255 bits, and add 0, 19 or 38.
 237          * This guarantees that the result is at most 2^255+37.
 238          */
 239         t4 = (38 & -t4) + (19 & -(t3 >> 63));
 240         t3 &= MASK63;
 241         k = _addcarry_u64(0, t0, t4, &d[0]);
 242         k = _addcarry_u64(k, t1, 0, &d[1]);
 243         k = _addcarry_u64(k, t2, 0, &d[2]);
 244         (void)_addcarry_u64(k, t3, 0, &d[3]);
 245
 246 #endif
 247 }
 248
 249 /*
 250  * Multiplication.
 251  */
 252 static inline void
 253 f255_mul(uint64_t *d, uint64_t *a, uint64_t *b)
 254 {
 255 #if BR_INT128
 256
 257         unsigned __int128 z;
 258         uint64_t t0, t1, t2, t3, t4, t5, t6, t7, th;
 259
 260         /*
 261          * Compute the product a*b over plain integers.
 262          */
 263         z = (unsigned __int128)a[0] * (unsigned __int128)b[0];
 264         t0 = (uint64_t)z;
 265         z = (unsigned __int128)a[0] * (unsigned __int128)b[1] + (z >> 64);
 266         t1 = (uint64_t)z;
 267         z = (unsigned __int128)a[0] * (unsigned __int128)b[2] + (z >> 64);
 268         t2 = (uint64_t)z;
 269         z = (unsigned __int128)a[0] * (unsigned __int128)b[3] + (z >> 64);
 270         t3 = (uint64_t)z;
 271         t4 = (uint64_t)(z >> 64);
 272
 273         z = (unsigned __int128)a[1] * (unsigned __int128)b[0]
 274                 + (unsigned __int128)t1;
 275         t1 = (uint64_t)z;
 276         z = (unsigned __int128)a[1] * (unsigned __int128)b[1]
 277                 + (unsigned __int128)t2 + (z >> 64);
 278         t2 = (uint64_t)z;
 279         z = (unsigned __int128)a[1] * (unsigned __int128)b[2]
 280                 + (unsigned __int128)t3 + (z >> 64);
 281         t3 = (uint64_t)z;
 282         z = (unsigned __int128)a[1] * (unsigned __int128)b[3]
 283                 + (unsigned __int128)t4 + (z >> 64);
 284         t4 = (uint64_t)z;
 285         t5 = (uint64_t)(z >> 64);
 286
 287         z = (unsigned __int128)a[2] * (unsigned __int128)b[0]
 288                 + (unsigned __int128)t2;
 289         t2 = (uint64_t)z;
 290         z = (unsigned __int128)a[2] * (unsigned __int128)b[1]
 291                 + (unsigned __int128)t3 + (z >> 64);
 292         t3 = (uint64_t)z;
 293         z = (unsigned __int128)a[2] * (unsigned __int128)b[2]
 294                 + (unsigned __int128)t4 + (z >> 64);
 295         t4 = (uint64_t)z;
 296         z = (unsigned __int128)a[2] * (unsigned __int128)b[3]
 297                 + (unsigned __int128)t5 + (z >> 64);
 298         t5 = (uint64_t)z;
 299         t6 = (uint64_t)(z >> 64);
 300
 301         z = (unsigned __int128)a[3] * (unsigned __int128)b[0]
 302                 + (unsigned __int128)t3;
 303         t3 = (uint64_t)z;
 304         z = (unsigned __int128)a[3] * (unsigned __int128)b[1]
 305                 + (unsigned __int128)t4 + (z >> 64);
 306         t4 = (uint64_t)z;
 307         z = (unsigned __int128)a[3] * (unsigned __int128)b[2]
 308                 + (unsigned __int128)t5 + (z >> 64);
 309         t5 = (uint64_t)z;
 310         z = (unsigned __int128)a[3] * (unsigned __int128)b[3]
 311                 + (unsigned __int128)t6 + (z >> 64);
 312         t6 = (uint64_t)z;
 313         t7 = (uint64_t)(z >> 64);
 314
 315         /*
 316          * Modulo p, we have:
 317          *
 318          *   2^255 = 19
 319          *   2^510 = 19*19 = 361
 320          *
 321          * We split the intermediate t into three parts, in basis
 322          * 2^255. The low one will be in t0..t3; the middle one in t4..t7.
 323          * The upper one can only be a single bit (th), since the
 324          * multiplication operands are at most 2^255+37 each.
 325          */
 326         th = t7 >> 62;
 327         t7 = ((t7 << 1) | (t6 >> 63)) & MASK63;
 328         t6 = (t6 << 1) | (t5 >> 63);
 329         t5 = (t5 << 1) | (t4 >> 63);
 330         t4 = (t4 << 1) | (t3 >> 63);
 331         t3 &= MASK63;
 332
 333         /*
 334          * Multiply the middle part (t4..t7) by 19. We truncate it to
 335          * 255 bits; the extra bits will go along with th.
 336          */
 337         z = (unsigned __int128)t4 * 19;
 338         t4 = (uint64_t)z;
 339         z = (unsigned __int128)t5 * 19 + (z >> 64);
 340         t5 = (uint64_t)z;
 341         z = (unsigned __int128)t6 * 19 + (z >> 64);
 342         t6 = (uint64_t)z;
 343         z = (unsigned __int128)t7 * 19 + (z >> 64);
 344         t7 = (uint64_t)z & MASK63;
 345
 346         th = (361 & -th) + (19 * (uint64_t)(z >> 63));
 347
 348         /*
 349          * Add elements together.
 350          * At this point:
 351          *   t0..t3 fits on 255 bits.
 352          *   t4..t7 fits on 255 bits.
 353          *   th <= 361 + 342 = 703.
 354          */
 355         z = (unsigned __int128)t0 + (unsigned __int128)t4
 356                 + (unsigned __int128)th;
 357         t0 = (uint64_t)z;
 358         z = (unsigned __int128)t1 + (unsigned __int128)t5 + (z >> 64);
 359         t1 = (uint64_t)z;
 360         z = (unsigned __int128)t2 + (unsigned __int128)t6 + (z >> 64);
 361         t2 = (uint64_t)z;
 362         z = (unsigned __int128)t3 + (unsigned __int128)t7 + (z >> 64);
 363         t3 = (uint64_t)z & MASK63;
 364         th = (uint64_t)(z >> 63);
 365
 366         /*
 367          * Since the sum is at most 2^256 + 703, the two upper bits, in th,
 368          * can only have value 0, 1 or 2. We just add th*19, which
 369          * guarantees a result of at most 2^255+37.
 370          */
 371         z = (unsigned __int128)t0 + (19 * th);
 372         d[0] = (uint64_t)z;
 373         z = (unsigned __int128)t1 + (z >> 64);
 374         d[1] = (uint64_t)z;
 375         z = (unsigned __int128)t2 + (z >> 64);
 376         d[2] = (uint64_t)z;
 377         d[3] = t3 + (uint64_t)(z >> 64);
 378
 379 #elif BR_UMUL128
 380
 381         uint64_t t0, t1, t2, t3, t4, t5, t6, t7, th;
 382         uint64_t h0, h1, h2, h3;
 383         unsigned char k;
 384
 385         /*
 386          * Compute the product a*b over plain integers.
 387          */
 388         t0 = _umul128(a[0], b[0], &h0);
 389         t1 = _umul128(a[0], b[1], &h1);
 390         k = _addcarry_u64(0, t1, h0, &t1);
 391         t2 = _umul128(a[0], b[2], &h2);
 392         k = _addcarry_u64(k, t2, h1, &t2);
 393         t3 = _umul128(a[0], b[3], &h3);
 394         k = _addcarry_u64(k, t3, h2, &t3);
 395         (void)_addcarry_u64(k, h3, 0, &t4);
 396
 397         k = _addcarry_u64(0, _umul128(a[1], b[0], &h0), t1, &t1);
 398         k = _addcarry_u64(k, _umul128(a[1], b[1], &h1), t2, &t2);
 399         k = _addcarry_u64(k, _umul128(a[1], b[2], &h2), t3, &t3);
 400         k = _addcarry_u64(k, _umul128(a[1], b[3], &h3), t4, &t4);
 401         t5 = k;
 402         k = _addcarry_u64(0, t2, h0, &t2);
 403         k = _addcarry_u64(k, t3, h1, &t3);
 404         k = _addcarry_u64(k, t4, h2, &t4);
 405         (void)_addcarry_u64(k, t5, h3, &t5);
 406
 407         k = _addcarry_u64(0, _umul128(a[2], b[0], &h0), t2, &t2);
 408         k = _addcarry_u64(k, _umul128(a[2], b[1], &h1), t3, &t3);
 409         k = _addcarry_u64(k, _umul128(a[2], b[2], &h2), t4, &t4);
 410         k = _addcarry_u64(k, _umul128(a[2], b[3], &h3), t5, &t5);
 411         t6 = k;
 412         k = _addcarry_u64(0, t3, h0, &t3);
 413         k = _addcarry_u64(k, t4, h1, &t4);
 414         k = _addcarry_u64(k, t5, h2, &t5);
 415         (void)_addcarry_u64(k, t6, h3, &t6);
 416
 417         k = _addcarry_u64(0, _umul128(a[3], b[0], &h0), t3, &t3);
 418         k = _addcarry_u64(k, _umul128(a[3], b[1], &h1), t4, &t4);
 419         k = _addcarry_u64(k, _umul128(a[3], b[2], &h2), t5, &t5);
 420         k = _addcarry_u64(k, _umul128(a[3], b[3], &h3), t6, &t6);
 421         t7 = k;
 422         k = _addcarry_u64(0, t4, h0, &t4);
 423         k = _addcarry_u64(k, t5, h1, &t5);
 424         k = _addcarry_u64(k, t6, h2, &t6);
 425         (void)_addcarry_u64(k, t7, h3, &t7);
 426
 427         /*
 428          * Modulo p, we have:
 429          *
 430          *   2^255 = 19
 431          *   2^510 = 19*19 = 361
 432          *
 433          * We split the intermediate t into three parts, in basis
 434          * 2^255. The low one will be in t0..t3; the middle one in t4..t7.
 435          * The upper one can only be a single bit (th), since the
 436          * multiplication operands are at most 2^255+37 each.
 437          */
 438         th = t7 >> 62;
 439         t7 = ((t7 << 1) | (t6 >> 63)) & MASK63;
 440         t6 = (t6 << 1) | (t5 >> 63);
 441         t5 = (t5 << 1) | (t4 >> 63);
 442         t4 = (t4 << 1) | (t3 >> 63);
 443         t3 &= MASK63;
 444
 445         /*
 446          * Multiply the middle part (t4..t7) by 19. We truncate it to
 447          * 255 bits; the extra bits will go along with th.
 448          */
 449         t4 = _umul128(t4, 19, &h0);
 450         t5 = _umul128(t5, 19, &h1);
 451         t6 = _umul128(t6, 19, &h2);
 452         t7 = _umul128(t7, 19, &h3);
 453         k = _addcarry_u64(0, t5, h0, &t5);
 454         k = _addcarry_u64(k, t6, h1, &t6);
 455         k = _addcarry_u64(k, t7, h2, &t7);
 456         (void)_addcarry_u64(k, h3, 0, &h3);
 457         th = (361 & -th) + (19 * ((h3 << 1) + (t7 >> 63)));
 458         t7 &= MASK63;
 459
 460         /*
 461          * Add elements together.
 462          * At this point:
 463          *   t0..t3 fits on 255 bits.
 464          *   t4..t7 fits on 255 bits.
 465          *   th <= 361 + 342 = 703.
 466          */
 467         k = _addcarry_u64(0, t0, t4, &t0);
 468         k = _addcarry_u64(k, t1, t5, &t1);
 469         k = _addcarry_u64(k, t2, t6, &t2);
 470         k = _addcarry_u64(k, t3, t7, &t3);
 471         t4 = k;
 472         k = _addcarry_u64(0, t0, th, &t0);
 473         k = _addcarry_u64(k, t1, 0, &t1);
 474         k = _addcarry_u64(k, t2, 0, &t2);
 475         k = _addcarry_u64(k, t3, 0, &t3);
 476         (void)_addcarry_u64(k, t4, 0, &t4);
 477
 478         th = (t4 << 1) + (t3 >> 63);
 479         t3 &= MASK63;
 480
 481         /*
 482          * Since the sum is at most 2^256 + 703, the two upper bits, in th,
 483          * can only have value 0, 1 or 2. We just add th*19, which
 484          * guarantees a result of at most 2^255+37.
 485          */
 486         k = _addcarry_u64(0, t0, 19 * th, &d[0]);
 487         k = _addcarry_u64(k, t1, 0, &d[1]);
 488         k = _addcarry_u64(k, t2, 0, &d[2]);
 489         (void)_addcarry_u64(k, t3, 0, &d[3]);
 490
 491 #endif
 492 }
 493
 494 /*
 495  * Multiplication by A24 = 121665.
 496  */
 497 static inline void
 498 f255_mul_a24(uint64_t *d, const uint64_t *a)
 499 {
 500 #if BR_INT128
 501
 502         uint64_t t0, t1, t2, t3;
 503         unsigned __int128 z;
 504
 505         z = (unsigned __int128)a[0] * 121665;
 506         t0 = (uint64_t)z;
 507         z = (unsigned __int128)a[1] * 121665 + (z >> 64);
 508         t1 = (uint64_t)z;
 509         z = (unsigned __int128)a[2] * 121665 + (z >> 64);
 510         t2 = (uint64_t)z;
 511         z = (unsigned __int128)a[3] * 121665 + (z >> 64);
 512         t3 = (uint64_t)z & MASK63;
 513
 514         z = (unsigned __int128)t0 + (19 * (uint64_t)(z >> 63));
 515         t0 = (uint64_t)z;
 516         z = (unsigned __int128)t1 + (z >> 64);
 517         t1 = (uint64_t)z;
 518         z = (unsigned __int128)t2 + (z >> 64);
 519         t2 = (uint64_t)z;
 520         t3 = t3 + (uint64_t)(z >> 64);
 521
 522         z = (unsigned __int128)t0 + (19 & -(t3 >> 63));
 523         d[0] = (uint64_t)z;
 524         z = (unsigned __int128)t1 + (z >> 64);
 525         d[1] = (uint64_t)z;
 526         z = (unsigned __int128)t2 + (z >> 64);
 527         d[2] = (uint64_t)z;
 528         d[3] = (t3 & MASK63) + (uint64_t)(z >> 64);
 529
 530 #elif BR_UMUL128
 531
 532         uint64_t t0, t1, t2, t3, t4, h0, h1, h2, h3;
 533         unsigned char k;
 534
 535         t0 = _umul128(a[0], 121665, &h0);
 536         t1 = _umul128(a[1], 121665, &h1);
 537         k = _addcarry_u64(0, t1, h0, &t1);
 538         t2 = _umul128(a[2], 121665, &h2);
 539         k = _addcarry_u64(k, t2, h1, &t2);
 540         t3 = _umul128(a[3], 121665, &h3);
 541         k = _addcarry_u64(k, t3, h2, &t3);
 542         (void)_addcarry_u64(k, h3, 0, &t4);
 543
 544         t4 = (t4 << 1) + (t3 >> 63);
 545         t3 &= MASK63;
 546         k = _addcarry_u64(0, t0, 19 * t4, &t0);
 547         k = _addcarry_u64(k, t1, 0, &t1);
 548         k = _addcarry_u64(k, t2, 0, &t2);
 549         (void)_addcarry_u64(k, t3, 0, &t3);
 550
 551         t4 = 19 & -(t3 >> 63);
 552         t3 &= MASK63;
 553         k = _addcarry_u64(0, t0, t4, &d[0]);
 554         k = _addcarry_u64(k, t1, 0, &d[1]);
 555         k = _addcarry_u64(k, t2, 0, &d[2]);
 556         (void)_addcarry_u64(k, t3, 0, &d[3]);
 557
 558 #endif
 559 }
 560
 561 /*
 562  * Finalize reduction.
 563  */
 564 static inline void
 565 f255_final_reduce(uint64_t *a)
 566 {
 567 #if BR_INT128
 568
 569         uint64_t t0, t1, t2, t3, m;
 570         unsigned __int128 z;
 571
 572         /*
 573          * We add 19. If the result (in t) is below 2^255, then a[]
 574          * is already less than 2^255-19, thus already reduced.
 575          * Otherwise, we subtract 2^255 from t[], in which case we
 576          * have t = a - (2^255-19), and that's our result.
 577          */
 578         z = (unsigned __int128)a[0] + 19;
 579         t0 = (uint64_t)z;
 580         z = (unsigned __int128)a[1] + (z >> 64);
 581         t1 = (uint64_t)z;
 582         z = (unsigned __int128)a[2] + (z >> 64);
 583         t2 = (uint64_t)z;
 584         t3 = a[3] + (uint64_t)(z >> 64);
 585
 586         m = -(t3 >> 63);
 587         t3 &= MASK63;
 588         a[0] ^= m & (a[0] ^ t0);
 589         a[1] ^= m & (a[1] ^ t1);
 590         a[2] ^= m & (a[2] ^ t2);
 591         a[3] ^= m & (a[3] ^ t3);
 592
 593 #elif BR_UMUL128
 594
 595         uint64_t t0, t1, t2, t3, m;
 596         unsigned char k;
 597
 598         /*
 599          * We add 19. If the result (in t) is below 2^255, then a[]
 600          * is already less than 2^255-19, thus already reduced.
 601          * Otherwise, we subtract 2^255 from t[], in which case we
 602          * have t = a - (2^255-19), and that's our result.
 603          */
 604         k = _addcarry_u64(0, a[0], 19, &t0);
 605         k = _addcarry_u64(k, a[1], 0, &t1);
 606         k = _addcarry_u64(k, a[2], 0, &t2);
 607         (void)_addcarry_u64(k, a[3], 0, &t3);
 608
 609         m = -(t3 >> 63);
 610         t3 &= MASK63;
 611         a[0] ^= m & (a[0] ^ t0);
 612         a[1] ^= m & (a[1] ^ t1);
 613         a[2] ^= m & (a[2] ^ t2);
 614         a[3] ^= m & (a[3] ^ t3);
 615
 616 #endif
 617 }
 618
 619 static uint32_t
 620 api_mul(unsigned char *G, size_t Glen,
 621         const unsigned char *kb, size_t kblen, int curve)
 622 {
 623         unsigned char k[32];
 624         uint64_t x1[4], x2[4], z2[4], x3[4], z3[4];
 625         uint32_t swap;
 626         int i;
 627
 628         (void)curve;
 629
 630         /*
 631          * Points are encoded over exactly 32 bytes. Multipliers must fit
 632          * in 32 bytes as well.
 633          */
 634         if (Glen != 32 || kblen > 32) {
 635                 return 0;
 636         }
 637
 638         /*
 639          * RFC 7748 mandates that the high bit of the last point byte must
 640          * be ignored/cleared.
 641          */
 642         x1[0] = br_dec64le(&G[ 0]);
 643         x1[1] = br_dec64le(&G[ 8]);
 644         x1[2] = br_dec64le(&G[16]);
 645         x1[3] = br_dec64le(&G[24]) & MASK63;
 646
 647         /*
 648          * We can use memset() to clear values, because exact-width types
 649          * like uint64_t are guaranteed to have no padding bits or
 650          * trap representations.
 651          */
 652         memset(x2, 0, sizeof x2);
 653         x2[0] = 1;
 654         memset(z2, 0, sizeof z2);
 655         memcpy(x3, x1, sizeof x1);
 656         memcpy(z3, x2, sizeof x2);
 657
 658         /*
 659          * The multiplier is provided in big-endian notation, and
 660          * possibly shorter than 32 bytes.
 661          */
 662         memset(k, 0, (sizeof k) - kblen);
 663         memcpy(k + (sizeof k) - kblen, kb, kblen);
 664         k[31] &= 0xF8;
 665         k[0] &= 0x7F;
 666         k[0] |= 0x40;
 667
 668         swap = 0;
 669
 670         for (i = 254; i >= 0; i --) {
 671                 uint64_t a[4], aa[4], b[4], bb[4], e[4];
 672                 uint64_t c[4], d[4], da[4], cb[4];
 673                 uint32_t kt;
 674
 675                 kt = (k[31 - (i >> 3)] >> (i & 7)) & 1;
 676                 swap ^= kt;
 677                 f255_cswap(x2, x3, swap);
 678                 f255_cswap(z2, z3, swap);
 679                 swap = kt;
 680
 681                 /* A = x_2 + z_2 */
 682                 f255_add(a, x2, z2);
 683
 684                 /* AA = A^2 */
 685                 f255_mul(aa, a, a);
 686
 687                 /* B = x_2 - z_2 */
 688                 f255_sub(b, x2, z2);
 689
 690                 /* BB = B^2 */
 691                 f255_mul(bb, b, b);
 692
 693                 /* E = AA - BB */
 694                 f255_sub(e, aa, bb);
 695
 696                 /* C = x_3 + z_3 */
 697                 f255_add(c, x3, z3);
 698
 699                 /* D = x_3 - z_3 */
 700                 f255_sub(d, x3, z3);
 701
 702                 /* DA = D * A */
 703                 f255_mul(da, d, a);
 704
 705                 /* CB = C * B */
 706                 f255_mul(cb, c, b);
 707
 708                 /* x_3 = (DA + CB)^2 */
 709                 f255_add(x3, da, cb);
 710                 f255_mul(x3, x3, x3);
 711
 712                 /* z_3 = x_1 * (DA - CB)^2 */
 713                 f255_sub(z3, da, cb);
 714                 f255_mul(z3, z3, z3);
 715                 f255_mul(z3, x1, z3);
 716
 717                 /* x_2 = AA * BB */
 718                 f255_mul(x2, aa, bb);
 719
 720                 /* z_2 = E * (AA + a24 * E) */
 721                 f255_mul_a24(z2, e);
 722                 f255_add(z2, aa, z2);
 723                 f255_mul(z2, e, z2);
 724         }
 725
 726         f255_cswap(x2, x3, swap);
 727         f255_cswap(z2, z3, swap);
 728
 729         /*
 730          * Compute 1/z2 = z2^(p-2). Since p = 2^255-19, we can mutualize
 731          * most non-squarings. We use x1 and x3, now useless, as temporaries.
 732          */
 733         memcpy(x1, z2, sizeof z2);
 734         for (i = 0; i < 15; i ++) {
 735                 f255_mul(x1, x1, x1);
 736                 f255_mul(x1, x1, z2);
 737         }
 738         memcpy(x3, x1, sizeof x1);
 739         for (i = 0; i < 14; i ++) {
 740                 int j;
 741
 742                 for (j = 0; j < 16; j ++) {
 743                         f255_mul(x3, x3, x3);
 744                 }
 745                 f255_mul(x3, x3, x1);
 746         }
 747         for (i = 14; i >= 0; i --) {
 748                 f255_mul(x3, x3, x3);
 749                 if ((0xFFEB >> i) & 1) {
 750                         f255_mul(x3, z2, x3);
 751                 }
 752         }
 753
 754         /*
 755          * Compute x2/z2. We have 1/z2 in x3.
 756          */
 757         f255_mul(x2, x2, x3);
 758         f255_final_reduce(x2);
 759
 760         /*
 761          * Encode the final x2 value in little-endian.
 762          */
 763         br_enc64le(G,      x2[0]);
 764         br_enc64le(G +  8, x2[1]);
 765         br_enc64le(G + 16, x2[2]);
 766         br_enc64le(G + 24, x2[3]);
 767         return 1;
 768 }
 769
 770 static size_t
 771 api_mulgen(unsigned char *R,
 772         const unsigned char *x, size_t xlen, int curve)
 773 {
 774         const unsigned char *G;
 775         size_t Glen;
 776
 777         G = api_generator(curve, &Glen);
 778         memcpy(R, G, Glen);
 779         api_mul(R, Glen, x, xlen, curve);
 780         return Glen;
 781 }
 782
 783 static uint32_t
 784 api_muladd(unsigned char *A, const unsigned char *B, size_t len,
 785         const unsigned char *x, size_t xlen,
 786         const unsigned char *y, size_t ylen, int curve)
 787 {
 788         /*
 789          * We don't implement this method, since it is used for ECDSA
 790          * only, and there is no ECDSA over Curve25519 (which instead
 791          * uses EdDSA).
 792          */
 793         (void)A;
 794         (void)B;
 795         (void)len;
 796         (void)x;
 797         (void)xlen;
 798         (void)y;
 799         (void)ylen;
 800         (void)curve;
 801         return 0;
 802 }
 803
 804 /* see bearssl_ec.h */
 805 const br_ec_impl br_ec_c25519_m64 = {
 806         (uint32_t)0x20000000,
 807         &api_generator,
 808         &api_order,
 809         &api_xoff,
 810         &api_mul,
 811         &api_mulgen,
 812         &api_muladd
 813 };
 814
 815 /* see bearssl_ec.h */
 816 const br_ec_impl *
 817 br_ec_c25519_m64_get(void)
 818 {
 819         return &br_ec_c25519_m64;
 820 }
 821
 822 #else
 823
 824 /* see bearssl_ec.h */
 825 const br_ec_impl *
 826 br_ec_c25519_m64_get(void)
 827 {
 828         return 0;
 829 }
 830
 831 #endif