2 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
4 * Permission is hereby granted, free of charge, to any person obtaining
5 * a copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31 print_int(const char *name, const uint32_t *x)
34 unsigned char tmp[40];
36 printf("%s = ", name);
37 for (u = 0; u < 9; u ++) {
38 if (x[u] > 0x3FFFFFFF) {
40 for (u = 0; u < 9; u ++) {
41 printf(" %08X", x[u]);
47 memset(tmp, 0, sizeof tmp);
48 for (u = 0; u < 9; u ++) {
60 for (j = 0; j < 8; j ++) {
61 tmp[39 - k - j] |= (unsigned char)w;
65 for (u = 8; u < 40; u ++) {
66 printf("%02X", tmp[u]);
73 * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
74 * that right-shifting a signed negative integer copies the sign bit
75 * (arithmetic right-shift). This is "implementation-defined behaviour",
76 * i.e. it is not undefined, but it may differ between compilers. Each
77 * compiler is supposed to document its behaviour in that respect. GCC
78 * explicitly defines that an arithmetic right shift is used. We expect
79 * all other compilers to do the same, because underlying CPU offer an
80 * arithmetic right shift opcode that could not be used otherwise.
83 #define ARSH(x, n) (((uint32_t)(x) >> (n)) \
84 | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
86 #define ARSH(x, n) ((*(int32_t *)&(x)) >> (n))
90 * Convert an integer from unsigned little-endian encoding to a sequence of
91 * 30-bit words in little-endian order. The final "partial" word is
95 le8_to_le30(uint32_t *dst
, const unsigned char *src
, size_t len
)
110 *dst
++ = (acc
| (b
<< acc_len
)) & 0x3FFFFFFF;
111 acc
= b
>> (30 - acc_len
);
119 * Convert an integer (30-bit words, little-endian) to unsigned
120 * little-endian encoding. The total encoding length is provided; all
121 * the destination bytes will be filled.
124 le30_to_le8(unsigned char *dst
, size_t len
, const uint32_t *src
)
136 *dst
++ = (unsigned char)(acc
| (w
<< acc_len
));
137 acc
= w
>> (8 - acc_len
);
140 *dst
++ = (unsigned char)acc
;
148 * Multiply two integers. Source integers are represented as arrays of
149 * nine 30-bit words, for values up to 2^270-1. Result is encoded over
150 * 18 words of 30 bits each.
153 mul9(uint32_t *d
, const uint32_t *a
, const uint32_t *b
)
156 * Maximum intermediate result is no more than
157 * 10376293531797946367, which fits in 64 bits. Reason:
159 * 10376293531797946367 = 9 * (2^30-1)^2 + 9663676406
160 * 10376293531797946367 < 9663676407 * 2^30
162 * Thus, adding together 9 products of 30-bit integers, with
163 * a carry of at most 9663676406, yields an integer that fits
164 * on 64 bits and generates a carry of at most 9663676406.
170 t
[ 0] = MUL31(a
[0], b
[0]);
171 t
[ 1] = MUL31(a
[0], b
[1])
173 t
[ 2] = MUL31(a
[0], b
[2])
176 t
[ 3] = MUL31(a
[0], b
[3])
180 t
[ 4] = MUL31(a
[0], b
[4])
185 t
[ 5] = MUL31(a
[0], b
[5])
191 t
[ 6] = MUL31(a
[0], b
[6])
198 t
[ 7] = MUL31(a
[0], b
[7])
206 t
[ 8] = MUL31(a
[0], b
[8])
215 t
[ 9] = MUL31(a
[1], b
[8])
223 t
[10] = MUL31(a
[2], b
[8])
230 t
[11] = MUL31(a
[3], b
[8])
236 t
[12] = MUL31(a
[4], b
[8])
241 t
[13] = MUL31(a
[5], b
[8])
245 t
[14] = MUL31(a
[6], b
[8])
248 t
[15] = MUL31(a
[7], b
[8])
250 t
[16] = MUL31(a
[8], b
[8]);
256 for (i
= 0; i
< 17; i
++) {
260 d
[i
] = (uint32_t)w
& 0x3FFFFFFF;
263 d
[17] = (uint32_t)cc
;
267 * Square a 270-bit integer, represented as an array of nine 30-bit words.
268 * Result uses 18 words of 30 bits each.
271 square9(uint32_t *d
, const uint32_t *a
)
277 t
[ 0] = MUL31(a
[0], a
[0]);
278 t
[ 1] = ((MUL31(a
[0], a
[1])) << 1);
279 t
[ 2] = MUL31(a
[1], a
[1])
280 + ((MUL31(a
[0], a
[2])) << 1);
281 t
[ 3] = ((MUL31(a
[0], a
[3])
282 + MUL31(a
[1], a
[2])) << 1);
283 t
[ 4] = MUL31(a
[2], a
[2])
284 + ((MUL31(a
[0], a
[4])
285 + MUL31(a
[1], a
[3])) << 1);
286 t
[ 5] = ((MUL31(a
[0], a
[5])
288 + MUL31(a
[2], a
[3])) << 1);
289 t
[ 6] = MUL31(a
[3], a
[3])
290 + ((MUL31(a
[0], a
[6])
292 + MUL31(a
[2], a
[4])) << 1);
293 t
[ 7] = ((MUL31(a
[0], a
[7])
296 + MUL31(a
[3], a
[4])) << 1);
297 t
[ 8] = MUL31(a
[4], a
[4])
298 + ((MUL31(a
[0], a
[8])
301 + MUL31(a
[3], a
[5])) << 1);
302 t
[ 9] = ((MUL31(a
[1], a
[8])
305 + MUL31(a
[4], a
[5])) << 1);
306 t
[10] = MUL31(a
[5], a
[5])
307 + ((MUL31(a
[2], a
[8])
309 + MUL31(a
[4], a
[6])) << 1);
310 t
[11] = ((MUL31(a
[3], a
[8])
312 + MUL31(a
[5], a
[6])) << 1);
313 t
[12] = MUL31(a
[6], a
[6])
314 + ((MUL31(a
[4], a
[8])
315 + MUL31(a
[5], a
[7])) << 1);
316 t
[13] = ((MUL31(a
[5], a
[8])
317 + MUL31(a
[6], a
[7])) << 1);
318 t
[14] = MUL31(a
[7], a
[7])
319 + ((MUL31(a
[6], a
[8])) << 1);
320 t
[15] = ((MUL31(a
[7], a
[8])) << 1);
321 t
[16] = MUL31(a
[8], a
[8]);
327 for (i
= 0; i
< 17; i
++) {
331 d
[i
] = (uint32_t)w
& 0x3FFFFFFF;
334 d
[17] = (uint32_t)cc
;
338 * Perform a "final reduction" in field F255 (field for Curve25519)
339 * The source value must be less than twice the modulus. If the value
340 * is not lower than the modulus, then the modulus is subtracted and
341 * this function returns 1; otherwise, it leaves it untouched and it
345 reduce_final_f255(uint32_t *d
)
351 memcpy(t
, d
, sizeof t
);
353 for (i
= 0; i
< 9; i
++) {
358 t
[i
] = w
& 0x3FFFFFFF;
362 CCOPY(cc
, d
, t
, sizeof t
);
367 * Perform a multiplication of two integers modulo 2^255-19.
368 * Operands are arrays of 9 words, each containing 30 bits of data, in
369 * little-endian order. Input value may be up to 2^256-1; on output, value
370 * fits on 256 bits and is lower than twice the modulus.
373 f255_mul(uint32_t *d
, const uint32_t *a
, const uint32_t *b
)
380 * Compute raw multiplication. All result words fit in 30 bits
381 * each; upper word (t[17]) must fit on 2 bits, since the product
382 * of two 256-bit integers must fit on 512 bits.
387 * Modular reduction: each high word is added where necessary.
388 * Since the modulus is 2^255-19 and word 9 corresponds to
389 * offset 9*30 = 270, word 9+k must be added to word k with
390 * a factor of 19*2^15 = 622592. The extra bits in word 8 are also
393 cc
= MUL31(t
[8] >> 15, 19);
395 for (i
= 0; i
< 9; i
++) {
396 w
= (uint64_t)t
[i
] + cc
+ MUL31(t
[i
+ 9], 622592);
397 t
[i
] = (uint32_t)w
& 0x3FFFFFFF;
400 cc
= MUL31(w
>> 15, 19);
402 for (i
= 0; i
< 9; i
++) {
404 d
[i
] = (uint32_t)w
& 0x3FFFFFFF;
410 * Perform a squaring of an integer modulo 2^255-19.
411 * Operands are arrays of 9 words, each containing 30 bits of data, in
412 * little-endian order. Input value may be up to 2^256-1; on output, value
413 * fits on 256 bits and is lower than twice the modulus.
416 f255_square(uint32_t *d
, const uint32_t *a
)
423 * Compute raw squaring. All result words fit in 30 bits
424 * each; upper word (t[17]) must fit on 2 bits, since the square
425 * of a 256-bit integers must fit on 512 bits.
430 * Modular reduction: each high word is added where necessary.
431 * Since the modulus is 2^255-19 and word 9 corresponds to
432 * offset 9*30 = 270, word 9+k must be added to word k with
433 * a factor of 19*2^15 = 622592. The extra bits in word 8 are also
436 cc
= MUL31(t
[8] >> 15, 19);
438 for (i
= 0; i
< 9; i
++) {
439 w
= (uint64_t)t
[i
] + cc
+ MUL31(t
[i
+ 9], 622592);
440 t
[i
] = (uint32_t)w
& 0x3FFFFFFF;
443 cc
= MUL31(w
>> 15, 19);
445 for (i
= 0; i
< 9; i
++) {
447 d
[i
] = (uint32_t)w
& 0x3FFFFFFF;
453 * Add two values in F255. Partial reduction is performed (down to less
454 * than twice the modulus).
457 f255_add(uint32_t *d
, const uint32_t *a
, const uint32_t *b
)
460 * Since operand words fit on 30 bits, we can use 32-bit
461 * variables throughout.
467 for (i
= 0; i
< 9; i
++) {
468 w
= a
[i
] + b
[i
] + cc
;
469 d
[i
] = w
& 0x3FFFFFFF;
472 cc
= MUL15(w
>> 15, 19);
474 for (i
= 0; i
< 9; i
++) {
476 d
[i
] = w
& 0x3FFFFFFF;
482 * Subtract one value from another in F255. Partial reduction is
483 * performed (down to less than twice the modulus).
486 f255_sub(uint32_t *d
, const uint32_t *a
, const uint32_t *b
)
489 * We actually compute a - b + 2*p, so that the final value is
490 * necessarily positive.
496 for (i
= 0; i
< 9; i
++) {
497 w
= a
[i
] - b
[i
] + cc
;
498 d
[i
] = w
& 0x3FFFFFFF;
501 cc
= MUL15((w
+ 0x10000) >> 15, 19);
503 for (i
= 0; i
< 9; i
++) {
505 d
[i
] = w
& 0x3FFFFFFF;
511 * Multiply an integer by the 'A24' constant (121665). Partial reduction
512 * is performed (down to less than twice the modulus).
515 f255_mul_a24(uint32_t *d
, const uint32_t *a
)
521 for (i
= 0; i
< 9; i
++) {
522 w
= MUL31(a
[i
], 121665) + cc
;
523 d
[i
] = (uint32_t)w
& 0x3FFFFFFF;
526 cc
= MUL31((uint32_t)(w
>> 15), 19);
528 for (i
= 0; i
< 9; i
++) {
529 w
= (uint64_t)d
[i
] + cc
;
530 d
[i
] = w
& 0x3FFFFFFF;
535 static const unsigned char GEN
[] = {
536 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
537 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
538 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
539 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
542 static const unsigned char ORDER
[] = {
543 0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
544 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
545 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
546 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
549 static const unsigned char *
550 api_generator(int curve
, size_t *len
)
557 static const unsigned char *
558 api_order(int curve
, size_t *len
)
566 api_xoff(int curve
, size_t *len
)
574 cswap(uint32_t *a
, uint32_t *b
, uint32_t ctl
)
579 for (i
= 0; i
< 9; i
++) {
584 tw
= ctl
& (aw
^ bw
);
591 api_mul(unsigned char *G
, size_t Glen
,
592 const unsigned char *kb
, size_t kblen
, int curve
)
594 uint32_t x1
[9], x2
[9], x3
[9], z2
[9], z3
[9];
595 uint32_t a
[9], aa
[9], b
[9], bb
[9];
596 uint32_t c
[9], d
[9], e
[9], da
[9], cb
[9];
604 * Points are encoded over exactly 32 bytes. Multipliers must fit
605 * in 32 bytes as well.
606 * RFC 7748 mandates that the high bit of the last point byte must
607 * be ignored/cleared.
609 if (Glen
!= 32 || kblen
> 32) {
615 * Initialise variables x1, x2, z2, x3 and z3. We set all of them
616 * into Montgomery representation.
618 x1
[8] = le8_to_le30(x1
, G
, 32);
619 memcpy(x3
, x1
, sizeof x1
);
620 memset(z2
, 0, sizeof z2
);
621 memset(x2
, 0, sizeof x2
);
623 memset(z3
, 0, sizeof z3
);
626 memcpy(k
, kb
, kblen
);
627 memset(k
+ kblen
, 0, (sizeof k
) - kblen
);
637 for (i
= 254; i
>= 0; i
--) {
640 kt
= (k
[i
>> 3] >> (i
& 7)) & 1;
675 f255_add(x3
, da
, cb
);
677 f255_sub(z3
, da
, cb
);
679 f255_mul(z3
, z3
, x1
);
680 f255_mul(x2
, aa
, bb
);
682 f255_add(z2
, z2
, aa
);
696 * Inverse z2 with a modular exponentiation. This is a simple
697 * square-and-multiply algorithm; we mutualise most non-squarings
698 * since the exponent contains almost only ones.
700 memcpy(a
, z2
, sizeof z2
);
701 for (i
= 0; i
< 15; i
++) {
705 memcpy(b
, a
, sizeof a
);
706 for (i
= 0; i
< 14; i
++) {
709 for (j
= 0; j
< 16; j
++) {
714 for (i
= 14; i
>= 0; i
--) {
716 if ((0xFFEB >> i
) & 1) {
721 reduce_final_f255(x2
);
722 le30_to_le8(G
, 32, x2
);
727 api_mulgen(unsigned char *R
,
728 const unsigned char *x
, size_t xlen
, int curve
)
730 const unsigned char *G
;
733 G
= api_generator(curve
, &Glen
);
735 api_mul(R
, Glen
, x
, xlen
, curve
);
740 api_muladd(unsigned char *A
, const unsigned char *B
, size_t len
,
741 const unsigned char *x
, size_t xlen
,
742 const unsigned char *y
, size_t ylen
, int curve
)
745 * We don't implement this method, since it is used for ECDSA
746 * only, and there is no ECDSA over Curve25519 (which instead
760 /* see bearssl_ec.h */
761 const br_ec_impl br_ec_c25519_m31
= {
762 (uint32_t)0x20000000,