2 * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
4 * Permission is hereby granted, free of charge, to any person obtaining
5 * a copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28 * In this file, we handle big integers with a custom format, i.e.
29 * without the usual one-word header. Value is split into 31-bit words,
30 * each stored in a 32-bit slot (top bit is zero) in little-endian
31 * order. The length (in words) is provided explicitly. In some cases,
32 * the value can be negative (using two's complement representation). In
33 * some cases, the top word is allowed to have a 32th bit.
37 * Negate big integer conditionally. The value consists of 'len' words,
38 * with 31 bits in each word (the top bit of each word should be 0,
39 * except possibly for the last word). If 'ctl' is 1, the negation is
40 * computed; otherwise, if 'ctl' is 0, then the value is unchanged.
43 cond_negate(uint32_t *a
, size_t len
, uint32_t ctl
)
50 for (k
= 0; k
< len
; k
++) {
55 a
[k
] = aw
& 0x7FFFFFFF;
61 * Finish modular reduction. Rules on input parameters:
63 * if neg = 1, then -m <= a < 0
64 * if neg = 0, then 0 <= a < 2*m
66 * If neg = 0, then the top word of a[] may use 32 bits.
68 * Also, modulus m must be odd.
71 finish_mod(uint32_t *a
, size_t len
, const uint32_t *m
, uint32_t neg
)
77 * First pass: compare a (assumed nonnegative) with m.
78 * Note that if the final word uses the top extra bit, then
79 * subtracting m must yield a value less than 2^31, since we
80 * assumed that a < 2*m.
83 for (k
= 0; k
< len
; k
++) {
88 cc
= (aw
- mw
- cc
) >> 31;
93 * if neg = 1, then we must add m (regardless of cc)
94 * if neg = 0 and cc = 0, then we must subtract m
95 * if neg = 0 and cc = 1, then we must do nothing
98 ym
= -(neg
| (1 - cc
));
100 for (k
= 0; k
< len
; k
++) {
104 mw
= (m
[k
] ^ xm
) & ym
;
106 a
[k
] = aw
& 0x7FFFFFFF;
113 * a <- (a*pa+b*pb)/(2^31)
114 * b <- (a*qa+b*qb)/(2^31)
115 * The division is assumed to be exact (i.e. the low word is dropped).
116 * If the final a is negative, then it is negated. Similarly for b.
117 * Returned value is the combination of two bits:
118 * bit 0: 1 if a had to be negated, 0 otherwise
119 * bit 1: 1 if b had to be negated, 0 otherwise
121 * Factors pa, pb, qa and qb must be at most 2^31 in absolute value.
122 * Source integers a and b must be nonnegative; top word is not allowed
123 * to contain an extra 32th bit.
126 co_reduce(uint32_t *a
, uint32_t *b
, size_t len
,
127 int64_t pa
, int64_t pb
, int64_t qa
, int64_t qb
)
135 for (k
= 0; k
< len
; k
++) {
144 * 0 <= wa <= 2^31 - 1
145 * 0 <= wb <= 2^31 - 1
148 * |za| <= (2^31-1)*(2^32) + (2^32-1) = 2^63 - 1
150 * Thus, the new value of cca is such that |cca| <= 2^32 - 1.
151 * The same applies to ccb.
155 za
= wa
* (uint64_t)pa
+ wb
* (uint64_t)pb
+ (uint64_t)cca
;
156 zb
= wa
* (uint64_t)qa
+ wb
* (uint64_t)qb
+ (uint64_t)ccb
;
158 a
[k
- 1] = za
& 0x7FFFFFFF;
159 b
[k
- 1] = zb
& 0x7FFFFFFF;
163 * For the new values of cca and ccb, we need a signed
164 * right-shift; since, in C, right-shifting a signed
165 * negative value is implementation-defined, we use a
166 * custom portable sign extension expression.
168 #define M ((uint64_t)1 << 32)
173 cca
= *(int64_t *)&tta
;
174 ccb
= *(int64_t *)&ttb
;
177 a
[len
- 1] = (uint32_t)cca
;
178 b
[len
- 1] = (uint32_t)ccb
;
180 nega
= (uint32_t)((uint64_t)cca
>> 63);
181 negb
= (uint32_t)((uint64_t)ccb
>> 63);
182 cond_negate(a
, len
, nega
);
183 cond_negate(b
, len
, negb
);
184 return nega
| (negb
<< 1);
189 * a <- (a*pa+b*pb)/(2^31) mod m
190 * b <- (a*qa+b*qb)/(2^31) mod m
192 * m0i is equal to -1/m[0] mod 2^31.
194 * Factors pa, pb, qa and qb must be at most 2^31 in absolute value.
195 * Source integers a and b must be nonnegative; top word is not allowed
196 * to contain an extra 32th bit.
199 co_reduce_mod(uint32_t *a
, uint32_t *b
, size_t len
,
200 int64_t pa
, int64_t pb
, int64_t qa
, int64_t qb
,
201 const uint32_t *m
, uint32_t m0i
)
209 fa
= ((a
[0] * (uint32_t)pa
+ b
[0] * (uint32_t)pb
) * m0i
) & 0x7FFFFFFF;
210 fb
= ((a
[0] * (uint32_t)qa
+ b
[0] * (uint32_t)qb
) * m0i
) & 0x7FFFFFFF;
211 for (k
= 0; k
< len
; k
++) {
217 * In this loop, carries 'cca' and 'ccb' always fit on
218 * 33 bits (in absolute value).
222 za
= wa
* (uint64_t)pa
+ wb
* (uint64_t)pb
223 + m
[k
] * (uint64_t)fa
+ (uint64_t)cca
;
224 zb
= wa
* (uint64_t)qa
+ wb
* (uint64_t)qb
225 + m
[k
] * (uint64_t)fb
+ (uint64_t)ccb
;
227 a
[k
- 1] = (uint32_t)za
& 0x7FFFFFFF;
228 b
[k
- 1] = (uint32_t)zb
& 0x7FFFFFFF;
231 #define M ((uint64_t)1 << 32)
236 cca
= *(int64_t *)&tta
;
237 ccb
= *(int64_t *)&ttb
;
240 a
[len
- 1] = (uint32_t)cca
;
241 b
[len
- 1] = (uint32_t)ccb
;
247 * (this is a case of Montgomery reduction)
248 * The top word of 'a' and 'b' may have a 32-th bit set.
249 * We may have to add or subtract the modulus.
251 finish_mod(a
, len
, m
, (uint32_t)((uint64_t)cca
>> 63));
252 finish_mod(b
, len
, m
, (uint32_t)((uint64_t)ccb
>> 63));
257 br_i31_moddiv(uint32_t *x
, const uint32_t *y
, const uint32_t *m
, uint32_t m0i
,
261 * Algorithm is an extended binary GCD. We maintain four values
262 * a, b, u and v, with the following invariants:
264 * a * x = y * u mod m
265 * b * x = y * v mod m
267 * Starting values are:
274 * The formal definition of the algorithm is a sequence of steps:
276 * - If a is even, then a <- a/2 and u <- u/2 mod m.
277 * - Otherwise, if b is even, then b <- b/2 and v <- v/2 mod m.
278 * - Otherwise, if a > b, then a <- (a-b)/2 and u <- (u-v)/2 mod m.
279 * - Otherwise, b <- (b-a)/2 and v <- (v-u)/2 mod m.
281 * Algorithm stops when a = b. At that point, they both are equal
282 * to GCD(y,m); the modular division succeeds if that value is 1.
283 * The result of the modular division is then u (or v: both are
284 * equal at that point).
286 * Each step makes either a or b shrink by at least one bit; hence,
287 * if m has bit length k bits, then 2k-2 steps are sufficient.
290 * Though complexity is quadratic in the size of m, the bit-by-bit
291 * processing is not very efficient. We can speed up processing by
292 * remarking that the decisions are taken based only on observation
293 * of the top and low bits of a and b.
295 * In the loop below, at each iteration, we use the two top words
296 * of a and b, and the low words of a and b, to compute reduction
297 * parameters pa, pb, qa and qb such that the new values for a
300 * a' = (a*pa + b*pb) / (2^31)
301 * b' = (a*qa + b*qb) / (2^31)
303 * the division being exact.
305 * Since the choices are based on the top words, they may be slightly
306 * off, requiring an optional correction: if a' < 0, then we replace
307 * pa with -pa, and pb with -pb. The total length of a and b is
308 * thus reduced by at least 30 bits at each iteration.
310 * The stopping conditions are still the same, though: when a
311 * and b become equal, they must be both odd (since m is odd,
312 * the GCD cannot be even), therefore the next operation is a
313 * subtraction, and one of the values becomes 0. At that point,
314 * nothing else happens, i.e. one value is stuck at 0, and the
315 * other one is the GCD.
318 uint32_t *a
, *b
, *u
, *v
;
321 len
= (m
[0] + 31) >> 5;
326 memcpy(a
, y
+ 1, len
* sizeof *y
);
327 memcpy(b
, m
+ 1, len
* sizeof *m
);
328 memset(v
, 0, len
* sizeof *v
);
331 * Loop below ensures that a and b are reduced by some bits each,
332 * for a total of at least 30 bits.
334 for (num
= ((m
[0] - (m
[0] >> 5)) << 1) + 30; num
>= 30; num
-= 30) {
337 uint32_t a0
, a1
, b0
, b1
;
340 int64_t pa
, pb
, qa
, qb
;
344 * Extract top words of a and b. If j is the highest
345 * index >= 1 such that a[j] != 0 or b[j] != 0, then we want
346 * (a[j] << 31) + a[j - 1], and (b[j] << 31) + b[j - 1].
347 * If a and b are down to one word each, then we use a[0]
362 a0
^= (a0
^ aw
) & c0
;
363 a1
^= (a1
^ aw
) & c1
;
364 b0
^= (b0
^ bw
) & c0
;
365 b1
^= (b1
^ bw
) & c1
;
367 c0
&= (((aw
| bw
) + 0x7FFFFFFF) >> 31) - (uint32_t)1;
371 * If c1 = 0, then we grabbed two words for a and b.
372 * If c1 != 0 but c0 = 0, then we grabbed one word. It
373 * is not possible that c1 != 0 and c0 != 0, because that
374 * would mean that both integers are zero.
380 a_hi
= ((uint64_t)a0
<< 31) + a1
;
381 b_hi
= ((uint64_t)b0
<< 31) + b1
;
386 * Compute reduction factors:
391 * such that a' and b' are both multiple of 2^31, but are
392 * only marginally larger than a and b.
398 for (i
= 0; i
< 31; i
++) {
402 * a <- (a-b)/2 if: a is odd, b is odd, a_hi > b_hi
403 * b <- (b-a)/2 if: a is odd, b is odd, a_hi <= b_hi
404 * a <- a/2 if: a is even
405 * b <- b/2 if: a is odd, b is even
407 * We multiply a_lo and b_lo by 2 at each
408 * iteration, thus a division by 2 really is a
409 * non-multiplication by 2.
411 uint32_t r
, oa
, ob
, cAB
, cBA
, cA
;
416 * But the GT() function works on uint32_t operands,
417 * so we inline a 64-bit version here.
420 r
= (uint32_t)((rz
^ ((a_hi
^ b_hi
)
421 & (a_hi
^ rz
))) >> 63);
424 * cAB = 1 if b must be subtracted from a
425 * cBA = 1 if a must be subtracted from b
426 * cA = 1 if a is divided by 2, 0 otherwise
430 * cAB and cBA cannot be both 1.
431 * if a is not divided by 2, b is.
433 oa
= (a_lo
>> i
) & 1;
434 ob
= (b_lo
>> i
) & 1;
436 cBA
= oa
& ob
& NOT(r
);
440 * Conditional subtractions.
443 a_hi
-= b_hi
& -(uint64_t)cAB
;
444 pa
-= qa
& -(int64_t)cAB
;
445 pb
-= qb
& -(int64_t)cAB
;
447 b_hi
-= a_hi
& -(uint64_t)cBA
;
448 qa
-= pa
& -(int64_t)cBA
;
449 qb
-= pb
& -(int64_t)cBA
;
454 a_lo
+= a_lo
& (cA
- 1);
455 pa
+= pa
& ((int64_t)cA
- 1);
456 pb
+= pb
& ((int64_t)cA
- 1);
457 a_hi
^= (a_hi
^ (a_hi
>> 1)) & -(uint64_t)cA
;
459 qa
+= qa
& -(int64_t)cA
;
460 qb
+= qb
& -(int64_t)cA
;
461 b_hi
^= (b_hi
^ (b_hi
>> 1)) & ((uint64_t)cA
- 1);
465 * Replace a and b with new values a' and b'.
467 r
= co_reduce(a
, b
, len
, pa
, pb
, qa
, qb
);
468 pa
-= pa
* ((r
& 1) << 1);
469 pb
-= pb
* ((r
& 1) << 1);
472 co_reduce_mod(u
, v
, len
, pa
, pb
, qa
, qb
, m
+ 1, m0i
);
476 * Now one of the arrays should be 0, and the other contains
477 * the GCD. If a is 0, then u is 0 as well, and v contains
478 * the division result.
479 * Result is correct if and only if GCD is 1.
481 r
= (a
[0] | b
[0]) ^ 1;
483 for (k
= 1; k
< len
; k
++) {