/*
 * BearSSL source file: src/ec/ec_c25519_m62.c
 * (upstream commit 44eb455f410059437d50962599d439bc73188fd6)
 */
1 /*
2 * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining
5 * a copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "inner.h"
26
27 #if BR_INT128 || BR_UMUL128
28
/*
 * Conventional Curve25519 base point: the u-coordinate 9, encoded in
 * little-endian over 32 bytes (decoded as such by api_mul()).
 */
static const unsigned char GEN[] = {
	0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
35
/*
 * Value returned by api_order(): 2^255-1 in big-endian notation.
 * NOTE(review): this is not the exact prime subgroup order of
 * Curve25519 — presumably an upper bound that is sufficient because
 * multipliers are clamped before use; confirm against callers
 * (e.g. key generation code).
 */
static const unsigned char ORDER[] = {
	0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
};
42
43 static const unsigned char *
44 api_generator(int curve, size_t *len)
45 {
46 (void)curve;
47 *len = 32;
48 return GEN;
49 }
50
51 static const unsigned char *
52 api_order(int curve, size_t *len)
53 {
54 (void)curve;
55 *len = 32;
56 return ORDER;
57 }
58
/*
 * Return the offset and length of the X coordinate within an encoded
 * point. Curve25519 points consist solely of the 32-byte u-coordinate,
 * so the offset is 0 and the length is 32.
 */
static size_t
api_xoff(int curve, size_t *len)
{
	(void)curve;
	*len = 32;
	return 0;
}
66
67 /*
68 * A field element is encoded as five 64-bit integers, in basis 2^51.
69 * Limbs may be occasionally larger than 2^51, to save on carry
70 * propagation costs.
71 */
72
73 #define MASK51 (((uint64_t)1 << 51) - (uint64_t)1)
74
/*
 * Swap two field elements, conditionally on a flag (ctl must be 0 or 1).
 * The swap is performed with masks only, so there is no data-dependent
 * branch or memory access pattern.
 */
static inline void
f255_cswap(uint64_t *a, uint64_t *b, uint32_t ctl)
{
	uint64_t mask, diff;
	int i;

	/* mask is all-ones when ctl == 1, all-zeros when ctl == 0 */
	mask = -(uint64_t)ctl;
	for (i = 0; i < 5; i ++) {
		diff = mask & (a[i] ^ b[i]);
		a[i] ^= diff;
		b[i] ^= diff;
	}
}
90
/*
 * Addition with no carry propagation. Limbs double in size; callers
 * must ensure inputs are small enough that no 64-bit overflow occurs.
 */
static inline void
f255_add(uint64_t *d, const uint64_t *a, const uint64_t *b)
{
	int i;

	for (i = 0; i < 5; i ++) {
		d[i] = a[i] + b[i];
	}
}
103
/*
 * Subtraction.
 * On input, limbs must fit on 60 bits each. On output, result is
 * partially reduced, with max value 2^255+19456; moreover, all
 * limbs will fit on 51 bits, except the low limb, which may have
 * value up to 2^51+19455.
 */
static inline void
f255_sub(uint64_t *d, const uint64_t *a, const uint64_t *b)
{
	uint64_t cc, w;

	/*
	 * We compute d = (2^255-19)*1024 + a - b. Since the limbs
	 * fit on 60 bits, the maximum value of operands are slightly
	 * more than 2^264, but much less than 2^265-19456. This
	 * ensures that the result is positive.
	 */

	/*
	 * Initial carry is 19456, since we add 2^265-19456
	 * ((2^255-19)*1024 = 2^265-19456). Each individual
	 * subtraction may yield a carry up to 513.
	 */
	w = a[0] - b[0] - 19456;
	d[0] = w & MASK51;
	/*
	 * If w went "negative", its upper bits (51..63) are all-ones
	 * (two's complement wraparound); -(w >> 51) then yields the
	 * borrow to subtract from the next limb, and the 0x3FF mask
	 * caps it at 10 bits, which is sufficient for the bound above.
	 * This is branchless, hence constant-time.
	 */
	cc = -(w >> 51) & 0x3FF;
	w = a[1] - b[1] - cc;
	d[1] = w & MASK51;
	cc = -(w >> 51) & 0x3FF;
	w = a[2] - b[2] - cc;
	d[2] = w & MASK51;
	cc = -(w >> 51) & 0x3FF;
	w = a[3] - b[3] - cc;
	d[3] = w & MASK51;
	cc = -(w >> 51) & 0x3FF;
	/*
	 * 2^61 in the top limb (whose weight is 2^204) contributes the
	 * 2^265 part of the bias added above.
	 */
	d[4] = ((uint64_t)1 << 61) + a[4] - b[4] - cc;

	/*
	 * Partial reduction, using 2^255 = 19 in the field. The
	 * intermediate result may be up to
	 * slightly above 2^265, but less than 2^265+2^255. When we
	 * truncate to 255 bits, the upper bits will be at most 1024.
	 */
	d[0] += 19 * (d[4] >> 51);
	d[4] &= MASK51;
}
149
/*
 * UMUL51(hi, lo, x, y) computes:
 *
 *   hi = floor((x * y) / (2^51))
 *   lo = x * y mod 2^51
 *
 * Note that lo < 2^51, but "hi" may be larger, if the input operands are
 * larger.
 */
#if BR_INT128

/*
 * Native 128-bit integer type available (GCC/Clang): one wide
 * multiplication, then split the product at bit 51.
 */
#define UMUL51(hi, lo, x, y)   do { \
		unsigned __int128 umul_tmp; \
		umul_tmp = (unsigned __int128)(x) * (unsigned __int128)(y); \
		(hi) = (uint64_t)(umul_tmp >> 51); \
		(lo) = (uint64_t)umul_tmp & MASK51; \
	} while (0)

#elif BR_UMUL128

/*
 * MSVC intrinsic: _umul128() returns the low 64 bits of the product
 * and writes the high 64 bits through its third argument. The two
 * halves are recombined around the bit-51 split point (13 = 64 - 51).
 */
#define UMUL51(hi, lo, x, y)   do { \
		uint64_t umul_hi, umul_lo; \
		umul_lo = _umul128((x), (y), &umul_hi); \
		(hi) = (umul_hi << 13) | (umul_lo >> 51); \
		(lo) = umul_lo & MASK51; \
	} while (0)

#endif
178
/*
 * Multiplication.
 * On input, limbs must fit on 54 bits each.
 * On output, limb 0 is at most 2^51 + 155647, and other limbs fit
 * on 51 bits each.
 *
 * d may alias a and/or b (e.g. for squarings): all partial products
 * are accumulated into the local array t[] before d is written.
 */
static inline void
f255_mul(uint64_t *d, uint64_t *a, uint64_t *b)
{
	uint64_t t[10], hi, lo, w, cc;

	/*
	 * Perform cross products, accumulating values without carry
	 * propagation. t[k] gathers all a[i]*b[j] contributions with
	 * i + j == k (low halves of products i+j == k, high halves of
	 * products i+j == k-1).
	 *
	 * Since input limbs fit on 54 bits each, each individual
	 * UMUL51 will produce a "hi" of less than 2^57. The maximum
	 * sum will be at most 5*(2^57-1) + 4*(2^51-1) (for t[5]),
	 * i.e. less than 324*2^51.
	 */

	UMUL51(t[1], t[0], a[0], b[0]);

	UMUL51(t[2], lo, a[1], b[0]); t[1] += lo;
	UMUL51(hi, lo, a[0], b[1]); t[1] += lo; t[2] += hi;

	UMUL51(t[3], lo, a[2], b[0]); t[2] += lo;
	UMUL51(hi, lo, a[1], b[1]); t[2] += lo; t[3] += hi;
	UMUL51(hi, lo, a[0], b[2]); t[2] += lo; t[3] += hi;

	UMUL51(t[4], lo, a[3], b[0]); t[3] += lo;
	UMUL51(hi, lo, a[2], b[1]); t[3] += lo; t[4] += hi;
	UMUL51(hi, lo, a[1], b[2]); t[3] += lo; t[4] += hi;
	UMUL51(hi, lo, a[0], b[3]); t[3] += lo; t[4] += hi;

	UMUL51(t[5], lo, a[4], b[0]); t[4] += lo;
	UMUL51(hi, lo, a[3], b[1]); t[4] += lo; t[5] += hi;
	UMUL51(hi, lo, a[2], b[2]); t[4] += lo; t[5] += hi;
	UMUL51(hi, lo, a[1], b[3]); t[4] += lo; t[5] += hi;
	UMUL51(hi, lo, a[0], b[4]); t[4] += lo; t[5] += hi;

	UMUL51(t[6], lo, a[4], b[1]); t[5] += lo;
	UMUL51(hi, lo, a[3], b[2]); t[5] += lo; t[6] += hi;
	UMUL51(hi, lo, a[2], b[3]); t[5] += lo; t[6] += hi;
	UMUL51(hi, lo, a[1], b[4]); t[5] += lo; t[6] += hi;

	UMUL51(t[7], lo, a[4], b[2]); t[6] += lo;
	UMUL51(hi, lo, a[3], b[3]); t[6] += lo; t[7] += hi;
	UMUL51(hi, lo, a[2], b[4]); t[6] += lo; t[7] += hi;

	UMUL51(t[8], lo, a[4], b[3]); t[7] += lo;
	UMUL51(hi, lo, a[3], b[4]); t[7] += lo; t[8] += hi;

	UMUL51(t[9], lo, a[4], b[4]); t[8] += lo;

	/*
	 * The upper words t[5]..t[9] are folded back into the lower
	 * words, using the rule that 2^255 = 19 in the field.
	 *
	 * Since each t[i] is less than 324*2^51, the additions below
	 * will yield less than 6480*2^51 in each limb; this fits in
	 * 64 bits (6480*2^51 < 8192*2^51 = 2^64), hence there is
	 * no overflow.
	 */
	t[0] += 19 * t[5];
	t[1] += 19 * t[6];
	t[2] += 19 * t[7];
	t[3] += 19 * t[8];
	t[4] += 19 * t[9];

	/*
	 * Propagate carries, bringing each limb back to 51 bits
	 * (except d[0], which receives the folded top carry below).
	 */
	w = t[0];
	d[0] = w & MASK51;
	cc = w >> 51;
	w = t[1] + cc;
	d[1] = w & MASK51;
	cc = w >> 51;
	w = t[2] + cc;
	d[2] = w & MASK51;
	cc = w >> 51;
	w = t[3] + cc;
	d[3] = w & MASK51;
	cc = w >> 51;
	w = t[4] + cc;
	d[4] = w & MASK51;
	cc = w >> 51;

	/*
	 * Since the limbs were 64-bit values, the top carry is at
	 * most 8192 (in practice, that cannot be reached). We simply
	 * performed a partial reduction.
	 */
	d[0] += 19 * cc;
}
275
276 /*
277 * Multiplication by A24 = 121665.
278 * Input must have limbs of 60 bits at most.
279 */
280 static inline void
281 f255_mul_a24(uint64_t *d, const uint64_t *a)
282 {
283 uint64_t t[5], cc, w;
284
285 /*
286 * 121665 = 15 * 8111. We first multiply by 15, with carry
287 * propagation and partial reduction.
288 */
289 w = a[0] * 15;
290 t[0] = w & MASK51;
291 cc = w >> 51;
292 w = a[1] * 15 + cc;
293 t[1] = w & MASK51;
294 cc = w >> 51;
295 w = a[2] * 15 + cc;
296 t[2] = w & MASK51;
297 cc = w >> 51;
298 w = a[3] * 15 + cc;
299 t[3] = w & MASK51;
300 cc = w >> 51;
301 w = a[4] * 15 + cc;
302 t[4] = w & MASK51;
303 t[0] += 19 * (w >> 51);
304
305 /*
306 * Then multiplication by 8111. At that point, we known that
307 * t[0] is less than 2^51 + 19*8192, and other limbs are less
308 * than 2^51; thus, there will be no overflow.
309 */
310 w = t[0] * 8111;
311 d[0] = w & MASK51;
312 cc = w >> 51;
313 w = t[1] * 8111 + cc;
314 d[1] = w & MASK51;
315 cc = w >> 51;
316 w = t[2] * 8111 + cc;
317 d[2] = w & MASK51;
318 cc = w >> 51;
319 w = t[3] * 8111 + cc;
320 d[3] = w & MASK51;
321 cc = w >> 51;
322 w = t[4] * 8111 + cc;
323 d[4] = w & MASK51;
324 d[0] += 19 * (w >> 51);
325 }
326
327 /*
328 * Finalize reduction.
329 * On input, limbs must fit on 51 bits, except possibly the low limb,
330 * which may be slightly above 2^51.
331 */
332 static inline void
333 f255_final_reduce(uint64_t *a)
334 {
335 uint64_t t[5], cc, w;
336
337 /*
338 * We add 19. If the result (in t[]) is below 2^255, then a[]
339 * is already less than 2^255-19, thus already reduced.
340 * Otherwise, we subtract 2^255 from t[], in which case we
341 * have t = a - (2^255-19), and that's our result.
342 */
343 w = a[0] + 19;
344 t[0] = w & MASK51;
345 cc = w >> 51;
346 w = a[1] + cc;
347 t[1] = w & MASK51;
348 cc = w >> 51;
349 w = a[2] + cc;
350 t[2] = w & MASK51;
351 cc = w >> 51;
352 w = a[3] + cc;
353 t[3] = w & MASK51;
354 cc = w >> 51;
355 w = a[4] + cc;
356 t[4] = w & MASK51;
357 cc = w >> 51;
358
359 /*
360 * The bit 255 of t is in cc. If that bit is 0, when a[] must
361 * be unchanged; otherwise, it must be replaced with t[].
362 */
363 cc = -cc;
364 a[0] ^= cc & (a[0] ^ t[0]);
365 a[1] ^= cc & (a[1] ^ t[1]);
366 a[2] ^= cc & (a[2] ^ t[2]);
367 a[3] ^= cc & (a[3] ^ t[3]);
368 a[4] ^= cc & (a[4] ^ t[4]);
369 }
370
/*
 * X25519 point multiplication (RFC 7748): compute k*G where G is a
 * 32-byte little-endian u-coordinate and kb a big-endian multiplier.
 * G is overwritten with the encoded result. Returns 1 on success,
 * 0 if the parameter lengths are invalid. The computation uses a
 * Montgomery ladder with mask-based conditional swaps; apart from
 * the initial length check, there is no branch or memory access
 * that depends on secret data.
 */
static uint32_t
api_mul(unsigned char *G, size_t Glen,
	const unsigned char *kb, size_t kblen, int curve)
{
	unsigned char k[32];
	uint64_t x1[5], x2[5], z2[5], x3[5], z3[5];
	uint32_t swap;
	int i;

	(void)curve;

	/*
	 * Points are encoded over exactly 32 bytes. Multipliers must fit
	 * in 32 bytes as well.
	 */
	if (Glen != 32 || kblen > 32) {
		return 0;
	}

	/*
	 * Decode the point into five 51-bit limbs; the overlapping
	 * 64-bit reads at offsets 0/6/12/19/24, shifted by 0/3/6/1/12
	 * bits, each start at bit 51*i of the 255-bit value.
	 * RFC 7748 mandates that the high bit of the last point byte must
	 * be ignored/cleared; the "& MASK51" in the initialization for
	 * x1[4] clears that bit.
	 */
	x1[0] = br_dec64le(&G[0]) & MASK51;
	x1[1] = (br_dec64le(&G[6]) >> 3) & MASK51;
	x1[2] = (br_dec64le(&G[12]) >> 6) & MASK51;
	x1[3] = (br_dec64le(&G[19]) >> 1) & MASK51;
	x1[4] = (br_dec64le(&G[24]) >> 12) & MASK51;

	/*
	 * Ladder state: (x2:z2) = (1:0) and (x3:z3) = (x1:1).
	 * We can use memset() to clear values, because exact-width types
	 * like uint64_t are guaranteed to have no padding bits or
	 * trap representations.
	 */
	memset(x2, 0, sizeof x2);
	x2[0] = 1;
	memset(z2, 0, sizeof z2);
	memcpy(x3, x1, sizeof x1);
	memcpy(z3, x2, sizeof x2);

	/*
	 * The multiplier is provided in big-endian notation, and
	 * possibly shorter than 32 bytes; left-pad with zeros, then
	 * "clamp" it as per RFC 7748: clear the low 3 bits (the
	 * cofactor) and the top bit, and set bit 254.
	 */
	memset(k, 0, (sizeof k) - kblen);
	memcpy(k + (sizeof k) - kblen, kb, kblen);
	k[31] &= 0xF8;
	k[0] &= 0x7F;
	k[0] |= 0x40;

	/*
	 * Montgomery ladder, from bit 254 down to bit 0. The "swap"
	 * variable defers each conditional swap: instead of swapping
	 * back at the end of an iteration, the next iteration XORs in
	 * its own bit, so only one pair of f255_cswap() calls is
	 * needed per iteration.
	 */
	swap = 0;

	for (i = 254; i >= 0; i --) {
		uint64_t a[5], aa[5], b[5], bb[5], e[5];
		uint64_t c[5], d[5], da[5], cb[5];
		uint32_t kt;

		/* kt = bit i of the clamped multiplier (big-endian bytes) */
		kt = (k[31 - (i >> 3)] >> (i & 7)) & 1;
		swap ^= kt;
		f255_cswap(x2, x3, swap);
		f255_cswap(z2, z3, swap);
		swap = kt;

		/*
		 * At that point, limbs of x_2 and z_2 are assumed to fit
		 * on at most 52 bits each.
		 *
		 * Each f255_add() adds one bit to the maximum range of
		 * the values, but f255_sub() and f255_mul() bring back
		 * the limbs into 52 bits. All f255_add() outputs are
		 * used only as inputs for f255_mul(), which ensures
		 * that limbs remain in the proper range.
		 */

		/* A = x_2 + z_2 -- limbs fit on 53 bits each */
		f255_add(a, x2, z2);

		/* AA = A^2 */
		f255_mul(aa, a, a);

		/* B = x_2 - z_2 */
		f255_sub(b, x2, z2);

		/* BB = B^2 */
		f255_mul(bb, b, b);

		/* E = AA - BB */
		f255_sub(e, aa, bb);

		/* C = x_3 + z_3 -- limbs fit on 53 bits each */
		f255_add(c, x3, z3);

		/* D = x_3 - z_3 */
		f255_sub(d, x3, z3);

		/* DA = D * A */
		f255_mul(da, d, a);

		/* CB = C * B */
		f255_mul(cb, c, b);

		/* x_3 = (DA + CB)^2 */
		f255_add(x3, da, cb);
		f255_mul(x3, x3, x3);

		/* z_3 = x_1 * (DA - CB)^2 */
		f255_sub(z3, da, cb);
		f255_mul(z3, z3, z3);
		f255_mul(z3, x1, z3);

		/* x_2 = AA * BB */
		f255_mul(x2, aa, bb);

		/* z_2 = E * (AA + a24 * E) */
		f255_mul_a24(z2, e);
		f255_add(z2, aa, z2);
		f255_mul(z2, e, z2);
	}

	f255_cswap(x2, x3, swap);
	f255_cswap(z2, z3, swap);

	/*
	 * Compute 1/z2 = z2^(p-2) (Fermat's little theorem). Since
	 * p = 2^255-19, the exponent is 2^255-21. We can mutualize
	 * most non-squarings. We use x1 and x3, now useless, as
	 * temporaries.
	 */
	/* x1 <- z2^(2^16-1): 15 iterations of square-then-multiply */
	memcpy(x1, z2, sizeof z2);
	for (i = 0; i < 15; i ++) {
		f255_mul(x1, x1, x1);
		f255_mul(x1, x1, z2);
	}
	/* x3 <- z2^(2^240-1): append the 16-bit window 14 more times */
	memcpy(x3, x1, sizeof x1);
	for (i = 0; i < 14; i ++) {
		int j;

		for (j = 0; j < 16; j ++) {
			f255_mul(x3, x3, x3);
		}
		f255_mul(x3, x3, x1);
	}
	/*
	 * Append the low 15 exponent bits (0xFFEB truncated to bits
	 * 0..14, i.e. 0x7FEB), yielding the full exponent
	 * (2^240-1)*2^15 + 0x7FEB = 2^255 - 21 = p - 2. The branch
	 * below depends only on a fixed public constant, so this
	 * remains constant-time.
	 */
	for (i = 14; i >= 0; i --) {
		f255_mul(x3, x3, x3);
		if ((0xFFEB >> i) & 1) {
			f255_mul(x3, z2, x3);
		}
	}

	/*
	 * Compute x2/z2. We have 1/z2 in x3. (If z2 = 0, every power
	 * of it is 0 as well, so the output is the all-zeros string.)
	 */
	f255_mul(x2, x2, x3);
	f255_final_reduce(x2);

	/*
	 * Encode the final x2 value in little-endian. We first assemble
	 * the 51-bit limbs into four 64-bit words.
	 */
	x2[0] |= x2[1] << 51;
	x2[1] = (x2[1] >> 13) | (x2[2] << 38);
	x2[2] = (x2[2] >> 26) | (x2[3] << 25);
	x2[3] = (x2[3] >> 39) | (x2[4] << 12);
	br_enc64le(G, x2[0]);
	br_enc64le(G + 8, x2[1]);
	br_enc64le(G + 16, x2[2]);
	br_enc64le(G + 24, x2[3]);
	return 1;
}
539
/*
 * Multiply the conventional generator by multiplier x, writing the
 * encoded result (32 bytes) into R. Returns the encoded length.
 */
static size_t
api_mulgen(unsigned char *R,
	const unsigned char *x, size_t xlen, int curve)
{
	size_t glen;
	const unsigned char *gen;

	gen = api_generator(curve, &glen);
	memcpy(R, gen, glen);
	(void)api_mul(R, glen, x, xlen, curve);
	return glen;
}
552
/*
 * Not implemented: this operation is needed for ECDSA verification
 * only, and there is no ECDSA over Curve25519 (signatures on that
 * curve use EdDSA instead). Always reports failure (0).
 */
static uint32_t
api_muladd(unsigned char *A, const unsigned char *B, size_t len,
	const unsigned char *x, size_t xlen,
	const unsigned char *y, size_t ylen, int curve)
{
	(void)A;
	(void)B;
	(void)len;
	(void)x;
	(void)xlen;
	(void)y;
	(void)ylen;
	(void)curve;
	return 0;
}
573
/* see bearssl_ec.h */
const br_ec_impl br_ec_c25519_m62 = {
	/*
	 * Supported-curves bitmask: 0x20000000 = 1 << 29, i.e. only
	 * curve ID 29 — presumably BR_EC_curve25519; confirm against
	 * bearssl_ec.h.
	 */
	(uint32_t)0x20000000,
	&api_generator,
	&api_order,
	&api_xoff,
	&api_mul,
	&api_mulgen,
	&api_muladd
};
584
585 /* see bearssl_ec.h */
586 const br_ec_impl *
587 br_ec_c25519_m62_get(void)
588 {
589 return &br_ec_c25519_m62;
590 }
591
592 #else
593
594 /* see bearssl_ec.h */
595 const br_ec_impl *
596 br_ec_c25519_m62_get(void)
597 {
598 return 0;
599 }
600
601 #endif