/*
 * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
27 #if BR_INT128 || BR_UMUL128
/*
 * Conventional generator for Curve25519: the point of x-coordinate 9,
 * encoded in little-endian over 32 bytes (RFC 7748 encoding).
 */
static const unsigned char GEN[] = {
	0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
/*
 * "Order" value reported through the API, in unsigned big-endian
 * encoding. NOTE(review): this is 2^255-1, not the true subgroup
 * order; it appears to serve only as an upper bound on multiplier
 * length for the generic EC API — confirm against bearssl_ec.h.
 */
static const unsigned char ORDER[] = {
	0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
};
47 static const unsigned char *
48 api_generator(int curve
, size_t *len
)
55 static const unsigned char *
56 api_order(int curve
, size_t *len
)
/*
 * Return the offset and length of the X coordinate within an encoded
 * point. For Curve25519, an encoded point is exactly the 32-byte X
 * coordinate, so the offset is 0 and the length is 32.
 */
static size_t
api_xoff(int curve, size_t *len)
{
	(void)curve;
	*len = 32;
	return 0;
}
/*
 * A field element is encoded as four 64-bit integers, in basis 2^63.
 * Operations return partially reduced values, which may range up to
 * 2^255+37.
 */

/* Mask selecting the low 63 bits of a 64-bit limb. */
#define MASK63   (((uint64_t)1 << 63) - (uint64_t)1)
/*
 * Swap two field elements, conditionally on a flag. The flag ctl must
 * be 0 (no swap) or 1 (swap); the exchange is done with masks and XOR
 * so that execution is constant-time (no data-dependent branch).
 */
static inline void
f255_cswap(uint64_t *a, uint64_t *b, uint32_t ctl)
{
	uint64_t m, w;

	/* m is all-ones if ctl == 1, all-zeros if ctl == 0. */
	m = -(uint64_t)ctl;
	w = m & (a[0] ^ b[0]); a[0] ^= w; b[0] ^= w;
	w = m & (a[1] ^ b[1]); a[1] ^= w; b[1] ^= w;
	w = m & (a[2] ^ b[2]); a[2] ^= w; b[2] ^= w;
	w = m & (a[3] ^ b[3]); a[3] ^= w; b[3] ^= w;
}
/*
 * Addition in the field. Operands and result are partially reduced
 * (lower than 2^255+37).
 */
static inline void
f255_add(uint64_t *d, const uint64_t *a, const uint64_t *b)
{
#if BR_INT128

	uint64_t t0, t1, t2, t3, cc;
	unsigned __int128 z;

	z = (unsigned __int128)a[0] + (unsigned __int128)b[0];
	t0 = (uint64_t)z;
	z = (unsigned __int128)a[1] + (unsigned __int128)b[1] + (z >> 64);
	t1 = (uint64_t)z;
	z = (unsigned __int128)a[2] + (unsigned __int128)b[2] + (z >> 64);
	t2 = (uint64_t)z;
	z = (unsigned __int128)a[3] + (unsigned __int128)b[3] + (z >> 64);
	t3 = (uint64_t)z & MASK63;
	cc = (uint64_t)(z >> 63);

	/*
	 * Since operands are at most 2^255+37, the sum is at most
	 * 2^256+74; thus, the carry cc is equal to 0, 1 or 2.
	 *
	 * We use: 2^255 = 19 mod p.
	 * Since we add 0, 19 or 38 to a value that fits on 255 bits,
	 * the result is at most 2^255+37.
	 */
	z = (unsigned __int128)t0 + (unsigned __int128)(19 * cc);
	d[0] = (uint64_t)z;
	z = (unsigned __int128)t1 + (z >> 64);
	d[1] = (uint64_t)z;
	z = (unsigned __int128)t2 + (z >> 64);
	d[2] = (uint64_t)z;
	d[3] = t3 + (uint64_t)(z >> 64);

#elif BR_UMUL128

	uint64_t t0, t1, t2, t3, cc;
	unsigned char k;

	k = _addcarry_u64(0, a[0], b[0], &t0);
	k = _addcarry_u64(k, a[1], b[1], &t1);
	k = _addcarry_u64(k, a[2], b[2], &t2);
	k = _addcarry_u64(k, a[3], b[3], &t3);
	/* cc gathers the two bits beyond position 255 (carry + top bit). */
	cc = (k << 1) + (t3 >> 63);
	t3 &= MASK63;

	/*
	 * Since operands are at most 2^255+37, the sum is at most
	 * 2^256+74; thus, the carry cc is equal to 0, 1 or 2.
	 *
	 * We use: 2^255 = 19 mod p.
	 * Since we add 0, 19 or 38 to a value that fits on 255 bits,
	 * the result is at most 2^255+37.
	 */
	k = _addcarry_u64(0, t0, 19 * cc, &d[0]);
	k = _addcarry_u64(k, t1, 0, &d[1]);
	k = _addcarry_u64(k, t2, 0, &d[2]);
	(void)_addcarry_u64(k, t3, 0, &d[3]);

#endif
}
/*
 * Subtraction in the field.
 * On input, limbs must fit on 60 bits each. On output, result is
 * partially reduced, with max value 2^255+19456; moreover, all
 * limbs will fit on 51 bits, except the low limb, which may have
 * value up to 2^51+19455.
 * NOTE(review): the limb-size figures above look inherited from a
 * smaller-limb implementation; the code below follows the invariants
 * stated inline (operands/result below 2^255+37) — confirm upstream.
 */
static inline void
f255_sub(uint64_t *d, const uint64_t *a, const uint64_t *b)
{
#if BR_INT128

	/*
	 * We compute t = 2^256 - 38 + a - b, which is necessarily
	 * positive but lower than 2^256 + 2^255, since a <= 2^255 + 37
	 * and b <= 2^255 + 37. We then subtract 0, p or 2*p, depending
	 * on the two upper bits of t (bits 255 and 256).
	 */
	uint64_t t0, t1, t2, t3, t4, cc;
	unsigned __int128 z;

	z = (unsigned __int128)a[0] - (unsigned __int128)b[0] - 38;
	t0 = (uint64_t)z;
	cc = -(uint64_t)(z >> 64);
	z = (unsigned __int128)a[1] - (unsigned __int128)b[1]
		- (unsigned __int128)cc;
	t1 = (uint64_t)z;
	cc = -(uint64_t)(z >> 64);
	z = (unsigned __int128)a[2] - (unsigned __int128)b[2]
		- (unsigned __int128)cc;
	t2 = (uint64_t)z;
	cc = -(uint64_t)(z >> 64);
	z = (unsigned __int128)a[3] - (unsigned __int128)b[3]
		- (unsigned __int128)cc;
	t3 = (uint64_t)z;
	t4 = 1 + (uint64_t)(z >> 64);

	/*
	 * We have a 257-bit result. The two top bits can be 00, 01 or 10,
	 * but not 11 (value t <= 2^256 - 38 + 2^255 + 37 = 2^256 + 2^255 - 1).
	 * Therefore, we can truncate to 255 bits, and add 0, 19 or 38.
	 * This guarantees that the result is at most 2^255+37.
	 */
	cc = (38 & -t4) + (19 & -(t3 >> 63));
	t3 &= MASK63;
	z = (unsigned __int128)t0 + (unsigned __int128)cc;
	d[0] = (uint64_t)z;
	z = (unsigned __int128)t1 + (z >> 64);
	d[1] = (uint64_t)z;
	z = (unsigned __int128)t2 + (z >> 64);
	d[2] = (uint64_t)z;
	d[3] = t3 + (uint64_t)(z >> 64);

#elif BR_UMUL128

	/*
	 * We compute t = 2^256 - 38 + a - b, which is necessarily
	 * positive but lower than 2^256 + 2^255, since a <= 2^255 + 37
	 * and b <= 2^255 + 37. We then subtract 0, p or 2*p, depending
	 * on the two upper bits of t (bits 255 and 256).
	 */
	uint64_t t0, t1, t2, t3, t4;
	unsigned char k;

	k = _subborrow_u64(0, a[0], b[0], &t0);
	k = _subborrow_u64(k, a[1], b[1], &t1);
	k = _subborrow_u64(k, a[2], b[2], &t2);
	k = _subborrow_u64(k, a[3], b[3], &t3);
	(void)_subborrow_u64(k, 1, 0, &t4);

	k = _subborrow_u64(0, t0, 38, &t0);
	k = _subborrow_u64(k, t1, 0, &t1);
	k = _subborrow_u64(k, t2, 0, &t2);
	k = _subborrow_u64(k, t3, 0, &t3);
	(void)_subborrow_u64(k, t4, 0, &t4);

	/*
	 * We have a 257-bit result. The two top bits can be 00, 01 or 10,
	 * but not 11 (value t <= 2^256 - 38 + 2^255 + 37 = 2^256 + 2^255 - 1).
	 * Therefore, we can truncate to 255 bits, and add 0, 19 or 38.
	 * This guarantees that the result is at most 2^255+37.
	 */
	t4 = (38 & -t4) + (19 & -(t3 >> 63));
	t3 &= MASK63;
	k = _addcarry_u64(0, t0, t4, &d[0]);
	k = _addcarry_u64(k, t1, 0, &d[1]);
	k = _addcarry_u64(k, t2, 0, &d[2]);
	(void)_addcarry_u64(k, t3, 0, &d[3]);

#endif
}
/*
 * Multiplication in the field. Operands are partially reduced (below
 * 2^255+37); the result is partially reduced as well.
 */
static inline void
f255_mul(uint64_t *d, uint64_t *a, uint64_t *b)
{
#if BR_INT128

	uint64_t t0, t1, t2, t3, t4, t5, t6, t7, th;
	unsigned __int128 z;

	/*
	 * Compute the product a*b over plain integers.
	 */
	z = (unsigned __int128)a[0] * (unsigned __int128)b[0];
	t0 = (uint64_t)z;
	z = (unsigned __int128)a[0] * (unsigned __int128)b[1] + (z >> 64);
	t1 = (uint64_t)z;
	z = (unsigned __int128)a[0] * (unsigned __int128)b[2] + (z >> 64);
	t2 = (uint64_t)z;
	z = (unsigned __int128)a[0] * (unsigned __int128)b[3] + (z >> 64);
	t3 = (uint64_t)z;
	t4 = (uint64_t)(z >> 64);

	z = (unsigned __int128)a[1] * (unsigned __int128)b[0]
		+ (unsigned __int128)t1;
	t1 = (uint64_t)z;
	z = (unsigned __int128)a[1] * (unsigned __int128)b[1]
		+ (unsigned __int128)t2 + (z >> 64);
	t2 = (uint64_t)z;
	z = (unsigned __int128)a[1] * (unsigned __int128)b[2]
		+ (unsigned __int128)t3 + (z >> 64);
	t3 = (uint64_t)z;
	z = (unsigned __int128)a[1] * (unsigned __int128)b[3]
		+ (unsigned __int128)t4 + (z >> 64);
	t4 = (uint64_t)z;
	t5 = (uint64_t)(z >> 64);

	z = (unsigned __int128)a[2] * (unsigned __int128)b[0]
		+ (unsigned __int128)t2;
	t2 = (uint64_t)z;
	z = (unsigned __int128)a[2] * (unsigned __int128)b[1]
		+ (unsigned __int128)t3 + (z >> 64);
	t3 = (uint64_t)z;
	z = (unsigned __int128)a[2] * (unsigned __int128)b[2]
		+ (unsigned __int128)t4 + (z >> 64);
	t4 = (uint64_t)z;
	z = (unsigned __int128)a[2] * (unsigned __int128)b[3]
		+ (unsigned __int128)t5 + (z >> 64);
	t5 = (uint64_t)z;
	t6 = (uint64_t)(z >> 64);

	z = (unsigned __int128)a[3] * (unsigned __int128)b[0]
		+ (unsigned __int128)t3;
	t3 = (uint64_t)z;
	z = (unsigned __int128)a[3] * (unsigned __int128)b[1]
		+ (unsigned __int128)t4 + (z >> 64);
	t4 = (uint64_t)z;
	z = (unsigned __int128)a[3] * (unsigned __int128)b[2]
		+ (unsigned __int128)t5 + (z >> 64);
	t5 = (uint64_t)z;
	z = (unsigned __int128)a[3] * (unsigned __int128)b[3]
		+ (unsigned __int128)t6 + (z >> 64);
	t6 = (uint64_t)z;
	t7 = (uint64_t)(z >> 64);

	/*
	 * Modular reduction: we use the rules:
	 *    2^255 = 19 mod p
	 *    2^510 = 19*19 = 361 mod p
	 *
	 * We split the intermediate t into three parts, in basis
	 * 2^255. The low one will be in t0..t3; the middle one in t4..t7.
	 * The upper one can only be a single bit (th), since the
	 * multiplication operands are at most 2^255+37 each.
	 */
	th = t7 >> 62;
	t7 = ((t7 << 1) | (t6 >> 63)) & MASK63;
	t6 = (t6 << 1) | (t5 >> 63);
	t5 = (t5 << 1) | (t4 >> 63);
	t4 = (t4 << 1) | (t3 >> 63);
	t3 &= MASK63;

	/*
	 * Multiply the middle part (t4..t7) by 19. We truncate it to
	 * 255 bits; the extra bits will go along with th.
	 */
	z = (unsigned __int128)t4 * 19;
	t4 = (uint64_t)z;
	z = (unsigned __int128)t5 * 19 + (z >> 64);
	t5 = (uint64_t)z;
	z = (unsigned __int128)t6 * 19 + (z >> 64);
	t6 = (uint64_t)z;
	z = (unsigned __int128)t7 * 19 + (z >> 64);
	t7 = (uint64_t)z & MASK63;

	th = (361 & -th) + (19 * (uint64_t)(z >> 63));

	/*
	 * Add elements together.
	 * At this point:
	 *    t0..t3 fits on 255 bits.
	 *    t4..t7 fits on 255 bits.
	 *    th <= 361 + 342 = 703.
	 */
	z = (unsigned __int128)t0 + (unsigned __int128)t4
		+ (unsigned __int128)th;
	t0 = (uint64_t)z;
	z = (unsigned __int128)t1 + (unsigned __int128)t5 + (z >> 64);
	t1 = (uint64_t)z;
	z = (unsigned __int128)t2 + (unsigned __int128)t6 + (z >> 64);
	t2 = (uint64_t)z;
	z = (unsigned __int128)t3 + (unsigned __int128)t7 + (z >> 64);
	t3 = (uint64_t)z & MASK63;
	th = (uint64_t)(z >> 63);

	/*
	 * Since the sum is at most 2^256 + 703, the two upper bits, in th,
	 * can only have value 0, 1 or 2. We just add th*19, which
	 * guarantees a result of at most 2^255+37.
	 */
	z = (unsigned __int128)t0 + (19 * th);
	d[0] = (uint64_t)z;
	z = (unsigned __int128)t1 + (z >> 64);
	d[1] = (uint64_t)z;
	z = (unsigned __int128)t2 + (z >> 64);
	d[2] = (uint64_t)z;
	d[3] = t3 + (uint64_t)(z >> 64);

#elif BR_UMUL128

	uint64_t t0, t1, t2, t3, t4, t5, t6, t7, th;
	uint64_t h0, h1, h2, h3;
	unsigned char k;

	/*
	 * Compute the product a*b over plain integers.
	 */
	t0 = _umul128(a[0], b[0], &h0);
	t1 = _umul128(a[0], b[1], &h1);
	k = _addcarry_u64(0, t1, h0, &t1);
	t2 = _umul128(a[0], b[2], &h2);
	k = _addcarry_u64(k, t2, h1, &t2);
	t3 = _umul128(a[0], b[3], &h3);
	k = _addcarry_u64(k, t3, h2, &t3);
	(void)_addcarry_u64(k, h3, 0, &t4);

	k = _addcarry_u64(0, _umul128(a[1], b[0], &h0), t1, &t1);
	k = _addcarry_u64(k, _umul128(a[1], b[1], &h1), t2, &t2);
	k = _addcarry_u64(k, _umul128(a[1], b[2], &h2), t3, &t3);
	k = _addcarry_u64(k, _umul128(a[1], b[3], &h3), t4, &t4);
	t5 = k;
	k = _addcarry_u64(0, t2, h0, &t2);
	k = _addcarry_u64(k, t3, h1, &t3);
	k = _addcarry_u64(k, t4, h2, &t4);
	(void)_addcarry_u64(k, t5, h3, &t5);

	k = _addcarry_u64(0, _umul128(a[2], b[0], &h0), t2, &t2);
	k = _addcarry_u64(k, _umul128(a[2], b[1], &h1), t3, &t3);
	k = _addcarry_u64(k, _umul128(a[2], b[2], &h2), t4, &t4);
	k = _addcarry_u64(k, _umul128(a[2], b[3], &h3), t5, &t5);
	t6 = k;
	k = _addcarry_u64(0, t3, h0, &t3);
	k = _addcarry_u64(k, t4, h1, &t4);
	k = _addcarry_u64(k, t5, h2, &t5);
	(void)_addcarry_u64(k, t6, h3, &t6);

	k = _addcarry_u64(0, _umul128(a[3], b[0], &h0), t3, &t3);
	k = _addcarry_u64(k, _umul128(a[3], b[1], &h1), t4, &t4);
	k = _addcarry_u64(k, _umul128(a[3], b[2], &h2), t5, &t5);
	k = _addcarry_u64(k, _umul128(a[3], b[3], &h3), t6, &t6);
	t7 = k;
	k = _addcarry_u64(0, t4, h0, &t4);
	k = _addcarry_u64(k, t5, h1, &t5);
	k = _addcarry_u64(k, t6, h2, &t6);
	(void)_addcarry_u64(k, t7, h3, &t7);

	/*
	 * Modular reduction: we use the rules:
	 *    2^255 = 19 mod p
	 *    2^510 = 19*19 = 361 mod p
	 *
	 * We split the intermediate t into three parts, in basis
	 * 2^255. The low one will be in t0..t3; the middle one in t4..t7.
	 * The upper one can only be a single bit (th), since the
	 * multiplication operands are at most 2^255+37 each.
	 */
	th = t7 >> 62;
	t7 = ((t7 << 1) | (t6 >> 63)) & MASK63;
	t6 = (t6 << 1) | (t5 >> 63);
	t5 = (t5 << 1) | (t4 >> 63);
	t4 = (t4 << 1) | (t3 >> 63);
	t3 &= MASK63;

	/*
	 * Multiply the middle part (t4..t7) by 19. We truncate it to
	 * 255 bits; the extra bits will go along with th.
	 */
	t4 = _umul128(t4, 19, &h0);
	t5 = _umul128(t5, 19, &h1);
	t6 = _umul128(t6, 19, &h2);
	t7 = _umul128(t7, 19, &h3);
	k = _addcarry_u64(0, t5, h0, &t5);
	k = _addcarry_u64(k, t6, h1, &t6);
	k = _addcarry_u64(k, t7, h2, &t7);
	(void)_addcarry_u64(k, h3, 0, &h3);
	th = (361 & -th) + (19 * ((h3 << 1) + (t7 >> 63)));
	t7 &= MASK63;

	/*
	 * Add elements together.
	 * At this point:
	 *    t0..t3 fits on 255 bits.
	 *    t4..t7 fits on 255 bits.
	 *    th <= 361 + 342 = 703.
	 */
	k = _addcarry_u64(0, t0, t4, &t0);
	k = _addcarry_u64(k, t1, t5, &t1);
	k = _addcarry_u64(k, t2, t6, &t2);
	k = _addcarry_u64(k, t3, t7, &t3);
	t4 = k;
	k = _addcarry_u64(0, t0, th, &t0);
	k = _addcarry_u64(k, t1, 0, &t1);
	k = _addcarry_u64(k, t2, 0, &t2);
	k = _addcarry_u64(k, t3, 0, &t3);
	(void)_addcarry_u64(k, t4, 0, &t4);

	th = (t4 << 1) + (t3 >> 63);
	t3 &= MASK63;

	/*
	 * Since the sum is at most 2^256 + 703, the two upper bits, in th,
	 * can only have value 0, 1 or 2. We just add th*19, which
	 * guarantees a result of at most 2^255+37.
	 */
	k = _addcarry_u64(0, t0, 19 * th, &d[0]);
	k = _addcarry_u64(k, t1, 0, &d[1]);
	k = _addcarry_u64(k, t2, 0, &d[2]);
	(void)_addcarry_u64(k, t3, 0, &d[3]);

#endif
}
/*
 * Multiplication by A24 = 121665 (the (A-2)/4 constant from the
 * Montgomery ladder formulas for Curve25519).
 */
static inline void
f255_mul_a24(uint64_t *d, const uint64_t *a)
{
#if BR_INT128

	uint64_t t0, t1, t2, t3;
	unsigned __int128 z;

	z = (unsigned __int128)a[0] * 121665;
	t0 = (uint64_t)z;
	z = (unsigned __int128)a[1] * 121665 + (z >> 64);
	t1 = (uint64_t)z;
	z = (unsigned __int128)a[2] * 121665 + (z >> 64);
	t2 = (uint64_t)z;
	z = (unsigned __int128)a[3] * 121665 + (z >> 64);
	t3 = (uint64_t)z & MASK63;

	/* Fold the bits beyond position 255 (times 19 = 2^255 mod p). */
	z = (unsigned __int128)t0 + (19 * (uint64_t)(z >> 63));
	t0 = (uint64_t)z;
	z = (unsigned __int128)t1 + (z >> 64);
	t1 = (uint64_t)z;
	z = (unsigned __int128)t2 + (z >> 64);
	t2 = (uint64_t)z;
	t3 = t3 + (uint64_t)(z >> 64);

	/* Second, shorter folding pass for the possible top carry bit. */
	z = (unsigned __int128)t0 + (19 & -(t3 >> 63));
	d[0] = (uint64_t)z;
	z = (unsigned __int128)t1 + (z >> 64);
	d[1] = (uint64_t)z;
	z = (unsigned __int128)t2 + (z >> 64);
	d[2] = (uint64_t)z;
	d[3] = (t3 & MASK63) + (uint64_t)(z >> 64);

#elif BR_UMUL128

	uint64_t t0, t1, t2, t3, t4, h0, h1, h2, h3;
	unsigned char k;

	t0 = _umul128(a[0], 121665, &h0);
	t1 = _umul128(a[1], 121665, &h1);
	k = _addcarry_u64(0, t1, h0, &t1);
	t2 = _umul128(a[2], 121665, &h2);
	k = _addcarry_u64(k, t2, h1, &t2);
	t3 = _umul128(a[3], 121665, &h3);
	k = _addcarry_u64(k, t3, h2, &t3);
	(void)_addcarry_u64(k, h3, 0, &t4);

	/* Fold the bits beyond position 255 (times 19 = 2^255 mod p). */
	t4 = (t4 << 1) + (t3 >> 63);
	t3 &= MASK63;
	k = _addcarry_u64(0, t0, 19 * t4, &t0);
	k = _addcarry_u64(k, t1, 0, &t1);
	k = _addcarry_u64(k, t2, 0, &t2);
	(void)_addcarry_u64(k, t3, 0, &t3);

	/* Second, shorter folding pass for the possible top carry bit. */
	t4 = 19 & -(t3 >> 63);
	t3 &= MASK63;
	k = _addcarry_u64(0, t0, t4, &d[0]);
	k = _addcarry_u64(k, t1, 0, &d[1]);
	k = _addcarry_u64(k, t2, 0, &d[2]);
	(void)_addcarry_u64(k, t3, 0, &d[3]);

#endif
}
/*
 * Finalize reduction: bring a partially reduced value (below 2^255+37)
 * down to the canonical range [0, p-1], constant-time.
 */
static inline void
f255_final_reduce(uint64_t *a)
{
#if BR_INT128

	uint64_t t0, t1, t2, t3, m;
	unsigned __int128 z;

	/*
	 * We add 19. If the result (in t) is below 2^255, then a[]
	 * is already less than 2^255-19, thus already reduced.
	 * Otherwise, we subtract 2^255 from t[], in which case we
	 * have t = a - (2^255-19), and that's our result.
	 */
	z = (unsigned __int128)a[0] + 19;
	t0 = (uint64_t)z;
	z = (unsigned __int128)a[1] + (z >> 64);
	t1 = (uint64_t)z;
	z = (unsigned __int128)a[2] + (z >> 64);
	t2 = (uint64_t)z;
	t3 = a[3] + (uint64_t)(z >> 64);

	/* m is all-ones if bit 255 of t is set (t >= 2^255). */
	m = -(t3 >> 63);
	t3 &= MASK63;
	a[0] ^= m & (a[0] ^ t0);
	a[1] ^= m & (a[1] ^ t1);
	a[2] ^= m & (a[2] ^ t2);
	a[3] ^= m & (a[3] ^ t3);

#elif BR_UMUL128

	uint64_t t0, t1, t2, t3, m;
	unsigned char k;

	/*
	 * We add 19. If the result (in t) is below 2^255, then a[]
	 * is already less than 2^255-19, thus already reduced.
	 * Otherwise, we subtract 2^255 from t[], in which case we
	 * have t = a - (2^255-19), and that's our result.
	 */
	k = _addcarry_u64(0, a[0], 19, &t0);
	k = _addcarry_u64(k, a[1], 0, &t1);
	k = _addcarry_u64(k, a[2], 0, &t2);
	(void)_addcarry_u64(k, a[3], 0, &t3);

	/* m is all-ones if bit 255 of t is set (t >= 2^255). */
	m = -(t3 >> 63);
	t3 &= MASK63;
	a[0] ^= m & (a[0] ^ t0);
	a[1] ^= m & (a[1] ^ t1);
	a[2] ^= m & (a[2] ^ t2);
	a[3] ^= m & (a[3] ^ t3);

#endif
}
624 api_mul(unsigned char *G
, size_t Glen
,
625 const unsigned char *kb
, size_t kblen
, int curve
)
628 uint64_t x1
[4], x2
[4], z2
[4], x3
[4], z3
[4];
635 * Points are encoded over exactly 32 bytes. Multipliers must fit
636 * in 32 bytes as well.
638 if (Glen
!= 32 || kblen
> 32) {
643 * RFC 7748 mandates that the high bit of the last point byte must
644 * be ignored/cleared.
646 x1
[0] = br_dec64le(&G
[ 0]);
647 x1
[1] = br_dec64le(&G
[ 8]);
648 x1
[2] = br_dec64le(&G
[16]);
649 x1
[3] = br_dec64le(&G
[24]) & MASK63
;
652 * We can use memset() to clear values, because exact-width types
653 * like uint64_t are guaranteed to have no padding bits or
654 * trap representations.
656 memset(x2
, 0, sizeof x2
);
658 memset(z2
, 0, sizeof z2
);
659 memcpy(x3
, x1
, sizeof x1
);
660 memcpy(z3
, x2
, sizeof x2
);
663 * The multiplier is provided in big-endian notation, and
664 * possibly shorter than 32 bytes.
666 memset(k
, 0, (sizeof k
) - kblen
);
667 memcpy(k
+ (sizeof k
) - kblen
, kb
, kblen
);
674 for (i
= 254; i
>= 0; i
--) {
675 uint64_t a
[4], aa
[4], b
[4], bb
[4], e
[4];
676 uint64_t c
[4], d
[4], da
[4], cb
[4];
679 kt
= (k
[31 - (i
>> 3)] >> (i
& 7)) & 1;
681 f255_cswap(x2
, x3
, swap
);
682 f255_cswap(z2
, z3
, swap
);
712 /* x_3 = (DA + CB)^2 */
713 f255_add(x3
, da
, cb
);
714 f255_mul(x3
, x3
, x3
);
716 /* z_3 = x_1 * (DA - CB)^2 */
717 f255_sub(z3
, da
, cb
);
718 f255_mul(z3
, z3
, z3
);
719 f255_mul(z3
, x1
, z3
);
722 f255_mul(x2
, aa
, bb
);
724 /* z_2 = E * (AA + a24 * E) */
726 f255_add(z2
, aa
, z2
);
730 f255_cswap(x2
, x3
, swap
);
731 f255_cswap(z2
, z3
, swap
);
734 * Compute 1/z2 = z2^(p-2). Since p = 2^255-19, we can mutualize
735 * most non-squarings. We use x1 and x3, now useless, as temporaries.
737 memcpy(x1
, z2
, sizeof z2
);
738 for (i
= 0; i
< 15; i
++) {
739 f255_mul(x1
, x1
, x1
);
740 f255_mul(x1
, x1
, z2
);
742 memcpy(x3
, x1
, sizeof x1
);
743 for (i
= 0; i
< 14; i
++) {
746 for (j
= 0; j
< 16; j
++) {
747 f255_mul(x3
, x3
, x3
);
749 f255_mul(x3
, x3
, x1
);
751 for (i
= 14; i
>= 0; i
--) {
752 f255_mul(x3
, x3
, x3
);
753 if ((0xFFEB >> i
) & 1) {
754 f255_mul(x3
, z2
, x3
);
759 * Compute x2/z2. We have 1/z2 in x3.
761 f255_mul(x2
, x2
, x3
);
762 f255_final_reduce(x2
);
765 * Encode the final x2 value in little-endian.
767 br_enc64le(G
, x2
[0]);
768 br_enc64le(G
+ 8, x2
[1]);
769 br_enc64le(G
+ 16, x2
[2]);
770 br_enc64le(G
+ 24, x2
[3]);
/*
 * Multiply the conventional generator by the given multiplier x
 * (unsigned big-endian, xlen bytes). The encoded result (32 bytes)
 * is written into R; the encoded length is returned.
 */
static size_t
api_mulgen(unsigned char *R,
	const unsigned char *x, size_t xlen, int curve)
{
	const unsigned char *G;
	size_t Glen;

	G = api_generator(curve, &Glen);
	memcpy(R, G, Glen);
	api_mul(R, Glen, x, xlen, curve);
	return Glen;
}
/*
 * Combined multiply-and-add (x*A + y*B). Not supported for this curve;
 * always returns 0 (failure).
 */
static uint32_t
api_muladd(unsigned char *A, const unsigned char *B, size_t len,
	const unsigned char *x, size_t xlen,
	const unsigned char *y, size_t ylen, int curve)
{
	/*
	 * We don't implement this method, since it is used for ECDSA
	 * only, and there is no ECDSA over Curve25519 (which instead
	 * uses EdDSA).
	 */
	(void)A;
	(void)B;
	(void)len;
	(void)x;
	(void)xlen;
	(void)y;
	(void)ylen;
	(void)curve;
	return 0;
}
808 /* see bearssl_ec.h */
809 const br_ec_impl br_ec_c25519_m64
= {
810 (uint32_t)0x20000000,
819 /* see bearssl_ec.h */
821 br_ec_c25519_m64_get(void)
823 return &br_ec_c25519_m64
;
828 /* see bearssl_ec.h */
830 br_ec_c25519_m64_get(void)