/* BearSSL - int/i15_core.c (text extracted from the Git web view) */

/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
  24
#include "inner.h"
  26
/*
 * This file contains the core "big integer" functions for the i15
 * implementation, which represents integers as sequences of 15-bit
 * words.
 */
  32
  33 /* see inner.h */
  34 uint32_t
  35 br_i15_iszero(const uint16_t *x)
  36 {
  37         uint32_t z;
  38         size_t u;
  39
  40         z = 0;
  41         for (u = (x[0] + 15) >> 4; u > 0; u --) {
  42                 z |= x[u];
  43         }
  44         return ~(z | -z) >> 31;
  45 }
  46
  47 /* see inner.h */
  48 uint16_t
  49 br_i15_ninv15(uint16_t x)
  50 {
  51         uint32_t y;
  52
  53         y = 2 - x;
  54         y = MUL15(y, 2 - MUL15(x, y));
  55         y = MUL15(y, 2 - MUL15(x, y));
  56         y = MUL15(y, 2 - MUL15(x, y));
  57         return MUX(x & 1, -y, 0) & 0x7FFF;
  58 }
  59
  60 /* see inner.h */
  61 uint32_t
  62 br_i15_add(uint16_t *a, const uint16_t *b, uint32_t ctl)
  63 {
  64         uint32_t cc;
  65         size_t u, m;
  66
  67         cc = 0;
  68         m = (a[0] + 31) >> 4;
  69         for (u = 1; u < m; u ++) {
  70                 uint32_t aw, bw, naw;
  71
  72                 aw = a[u];
  73                 bw = b[u];
  74                 naw = aw + bw + cc;
  75                 cc = naw >> 15;
  76                 a[u] = MUX(ctl, naw & 0x7FFF, aw);
  77         }
  78         return cc;
  79 }
  80
  81 /* see inner.h */
  82 uint32_t
  83 br_i15_sub(uint16_t *a, const uint16_t *b, uint32_t ctl)
  84 {
  85         uint32_t cc;
  86         size_t u, m;
  87
  88         cc = 0;
  89         m = (a[0] + 31) >> 4;
  90         for (u = 1; u < m; u ++) {
  91                 uint32_t aw, bw, naw;
  92
  93                 aw = a[u];
  94                 bw = b[u];
  95                 naw = aw - bw - cc;
  96                 cc = naw >> 31;
  97                 a[u] = MUX(ctl, naw & 0x7FFF, aw);
  98         }
  99         return cc;
 100 }
 101
 102 /*
 103  * Constant-time division. The divisor must not be larger than 16 bits,
 104  * and the quotient must fit on 17 bits.
 105  */
 106 static uint32_t
 107 divrem16(uint32_t x, uint32_t d, uint32_t *r)
 108 {
 109         int i;
 110         uint32_t q;
 111
 112         q = 0;
 113         d <<= 16;
 114         for (i = 16; i >= 0; i --) {
 115                 uint32_t ctl;
 116
 117                 ctl = LE(d, x);
 118                 q |= ctl << i;
 119                 x -= (-ctl) & d;
 120                 d >>= 1;
 121         }
 122         if (r != NULL) {
 123                 *r = x;
 124         }
 125         return q;
 126 }
 127
 128 /* see inner.h */
 129 void
 130 br_i15_muladd_small(uint16_t *x, uint16_t z, const uint16_t *m)
 131 {
 132         /*
 133          * Constant-time: we accept to leak the exact bit length of the
 134          * modulus m.
 135          */
 136         unsigned m_bitlen, mblr;
 137         size_t u, mlen;
 138         uint32_t hi, a0, a, b, q;
 139         uint32_t cc, tb, over, under;
 140
 141         /*
 142          * Simple case: the modulus fits on one word.
 143          */
 144         m_bitlen = m[0];
 145         if (m_bitlen == 0) {
 146                 return;
 147         }
 148         if (m_bitlen <= 15) {
 149                 uint32_t rem;
 150
 151                 divrem16(((uint32_t)x[1] << 15) | z, m[1], &rem);
 152                 x[1] = rem;
 153                 return;
 154         }
 155         mlen = (m_bitlen + 15) >> 4;
 156         mblr = m_bitlen & 15;
 157
 158         /*
 159          * Principle: we estimate the quotient (x*2^15+z)/m by
 160          * doing a 30/15 division with the high words.
 161          *
 162          * Let:
 163          *   w = 2^15
 164          *   a = (w*a0 + a1) * w^N + a2
 165          *   b = b0 * w^N + b2
 166          * such that:
 167          *   0 <= a0 < w
 168          *   0 <= a1 < w
 169          *   0 <= a2 < w^N
 170          *   w/2 <= b0 < w
 171          *   0 <= b2 < w^N
 172          *   a < w*b
 173          * I.e. the two top words of a are a0:a1, the top word of b is
 174          * b0, we ensured that b0 is "full" (high bit set), and a is
 175          * such that the quotient q = a/b fits on one word (0 <= q < w).
 176          *
 177          * If a = b*q + r (with 0 <= r < q), then we can estimate q by
 178          * using a division on the top words:
 179          *   a0*w + a1 = b0*u + v (with 0 <= v < b0)
 180          * Then the following holds:
 181          *   0 <= u <= w
 182          *   u-2 <= q <= u
 183          */
 184         hi = x[mlen];
 185         if (mblr == 0) {
 186                 a0 = x[mlen];
 187                 memmove(x + 2, x + 1, (mlen - 1) * sizeof *x);
 188                 x[1] = z;
 189                 a = (a0 << 15) + x[mlen];
 190                 b = m[mlen];
 191         } else {
 192                 a0 = (x[mlen] << (15 - mblr)) | (x[mlen - 1] >> mblr);
 193                 memmove(x + 2, x + 1, (mlen - 1) * sizeof *x);
 194                 x[1] = z;
 195                 a = (a0 << 15) | (((x[mlen] << (15 - mblr))
 196                         | (x[mlen - 1] >> mblr)) & 0x7FFF);
 197                 b = (m[mlen] << (15 - mblr)) | (m[mlen - 1] >> mblr);
 198         }
 199         q = divrem16(a, b, NULL);
 200
 201         /*
 202          * We computed an estimate for q, but the real one may be q,
 203          * q-1 or q-2; moreover, the division may have returned a value
 204          * 8000 or even 8001 if the two high words were identical, and
 205          * we want to avoid values beyond 7FFF. We thus adjust q so
 206          * that the "true" multiplier will be q+1, q or q-1, and q is
 207          * in the 0000..7FFF range.
 208          */
 209         q = MUX(EQ(b, a0), 0x7FFF, q - 1 + ((q - 1) >> 31));
 210
 211         /*
 212          * We subtract q*m from x (x has an extra high word of value 'hi').
 213          * Since q may be off by 1 (in either direction), we may have to
 214          * add or subtract m afterwards.
 215          *
 216          * The 'tb' flag will be true (1) at the end of the loop if the
 217          * result is greater than or equal to the modulus (not counting
 218          * 'hi' or the carry).
 219          */
 220         cc = 0;
 221         tb = 1;
 222         for (u = 1; u <= mlen; u ++) {
 223                 uint32_t mw, zl, xw, nxw;
 224
 225                 mw = m[u];
 226                 zl = MUL15(mw, q) + cc;
 227                 cc = zl >> 15;
 228                 zl &= 0x7FFF;
 229                 xw = x[u];
 230                 nxw = xw - zl;
 231                 cc += nxw >> 31;
 232                 nxw &= 0x7FFF;
 233                 x[u] = nxw;
 234                 tb = MUX(EQ(nxw, mw), tb, GT(nxw, mw));
 235         }
 236
 237         /*
 238          * If we underestimated q, then either cc < hi (one extra bit
 239          * beyond the top array word), or cc == hi and tb is true (no
 240          * extra bit, but the result is not lower than the modulus).
 241          *
 242          * If we overestimated q, then cc > hi.
 243          */
 244         over = GT(cc, hi);
 245         under = ~over & (tb | LT(cc, hi));
 246         br_i15_add(x, m, over);
 247         br_i15_sub(x, m, under);
 248 }
 249
 250 /* see inner.h */
 251 void
 252 br_i15_montymul(uint16_t *d, const uint16_t *x, const uint16_t *y,
 253         const uint16_t *m, uint16_t m0i)
 254 {
 255         size_t len, len4, u, v;
 256         uint32_t dh;
 257
 258         len = (m[0] + 15) >> 4;
 259         len4 = len & ~(size_t)3;
 260         br_i15_zero(d, m[0]);
 261         dh = 0;
 262         for (u = 0; u < len; u ++) {
 263                 uint32_t f, xu, r, zh;
 264
 265                 xu = x[u + 1];
 266                 f = MUL15((d[1] + MUL15(x[u + 1], y[1])) & 0x7FFF, m0i)
 267                         & 0x7FFF;
 268
 269                 r = 0;
 270                 for (v = 0; v < len4; v += 4) {
 271                         uint32_t z;
 272
 273                         z = d[v + 1] + MUL15(xu, y[v + 1])
 274                                 + MUL15(f, m[v + 1]) + r;
 275                         r = z >> 15;
 276                         d[v + 0] = z & 0x7FFF;
 277                         z = d[v + 2] + MUL15(xu, y[v + 2])
 278                                 + MUL15(f, m[v + 2]) + r;
 279                         r = z >> 15;
 280                         d[v + 1] = z & 0x7FFF;
 281                         z = d[v + 3] + MUL15(xu, y[v + 3])
 282                                 + MUL15(f, m[v + 3]) + r;
 283                         r = z >> 15;
 284                         d[v + 2] = z & 0x7FFF;
 285                         z = d[v + 4] + MUL15(xu, y[v + 4])
 286                                 + MUL15(f, m[v + 4]) + r;
 287                         r = z >> 15;
 288                         d[v + 3] = z & 0x7FFF;
 289                 }
 290                 for (; v < len; v ++) {
 291                         uint32_t z;
 292
 293                         z = d[v + 1] + MUL15(xu, y[v + 1])
 294                                 + MUL15(f, m[v + 1]) + r;
 295                         r = z >> 15;
 296                         d[v + 0] = z & 0x7FFF;
 297                 }
 298
 299                 zh = dh + r;
 300                 d[len] = zh & 0x7FFF;
 301                 dh = zh >> 15;
 302         }
 303
 304         /*
 305          * Restore the bit length (it was overwritten in the loop above).
 306          */
 307         d[0] = m[0];
 308
 309         /*
 310          * d[] may be greater than m[], but it is still lower than twice
 311          * the modulus.
 312          */
 313         br_i15_sub(d, m, NEQ(dh, 0) | NOT(br_i15_sub(d, m, 0)));
 314 }
 315
 316 /* see inner.h */
 317 void
 318 br_i15_to_monty(uint16_t *x, const uint16_t *m)
 319 {
 320         unsigned k;
 321
 322         for (k = (m[0] + 15) >> 4; k > 0; k --) {
 323                 br_i15_muladd_small(x, 0, m);
 324         }
 325 }
 326
 327 /* see inner.h */
 328 void
 329 br_i15_modpow(uint16_t *x,
 330         const unsigned char *e, size_t elen,
 331         const uint16_t *m, uint16_t m0i, uint16_t *t1, uint16_t *t2)
 332 {
 333         size_t mlen;
 334         unsigned k;
 335
 336         mlen = ((m[0] + 31) >> 4) * sizeof m[0];
 337         memcpy(t1, x, mlen);
 338         br_i15_to_monty(t1, m);
 339         br_i15_zero(x, m[0]);
 340         x[1] = 1;
 341         for (k = 0; k < ((unsigned)elen << 3); k ++) {
 342                 uint32_t ctl;
 343
 344                 ctl = (e[elen - 1 - (k >> 3)] >> (k & 7)) & 1;
 345                 br_i15_montymul(t2, x, t1, m, m0i);
 346                 CCOPY(ctl, x, t2, mlen);
 347                 br_i15_montymul(t2, t1, t1, m, m0i);
 348                 memcpy(t1, t2, mlen);
 349         }
 350 }
 351
 352 /* see inner.h */
 353 void
 354 br_i15_encode(void *dst, size_t len, const uint16_t *x)
 355 {
 356         unsigned char *buf;
 357         size_t u, xlen;
 358         uint32_t acc;
 359         int acc_len;
 360
 361         xlen = (x[0] + 15) >> 4;
 362         if (xlen == 0) {
 363                 memset(dst, 0, len);
 364                 return;
 365         }
 366         u = 1;
 367         acc = 0;
 368         acc_len = 0;
 369         buf = dst;
 370         while (len -- > 0) {
 371                 if (acc_len < 8) {
 372                         if (u <= xlen) {
 373                                 acc += (uint32_t)x[u ++] << acc_len;
 374                         }
 375                         acc_len += 15;
 376                 }
 377                 buf[len] = (unsigned char)acc;
 378                 acc >>= 8;
 379                 acc_len -= 8;
 380         }
 381 }
 382
 383 /* see inner.h */
 384 uint32_t
 385 br_i15_decode_mod(uint16_t *x, const void *src, size_t len, const uint16_t *m)
 386 {
 387         /*
 388          * Two-pass algorithm: in the first pass, we determine whether the
 389          * value fits; in the second pass, we do the actual write.
 390          *
 391          * During the first pass, 'r' contains the comparison result so
 392          * far:
 393          *  0x00000000   value is equal to the modulus
 394          *  0x00000001   value is greater than the modulus
 395          *  0xFFFFFFFF   value is lower than the modulus
 396          *
 397          * Since we iterate starting with the least significant bytes (at
 398          * the end of src[]), each new comparison overrides the previous
 399          * except when the comparison yields 0 (equal).
 400          *
 401          * During the second pass, 'r' is either 0xFFFFFFFF (value fits)
 402          * or 0x00000000 (value does not fit).
 403          *
 404          * We must iterate over all bytes of the source, _and_ possibly
 405          * some extra virutal bytes (with value 0) so as to cover the
 406          * complete modulus as well. We also add 4 such extra bytes beyond
 407          * the modulus length because it then guarantees that no accumulated
 408          * partial word remains to be processed.
 409          */
 410         const unsigned char *buf;
 411         size_t mlen, tlen;
 412         int pass;
 413         uint32_t r;
 414
 415         buf = src;
 416         mlen = (m[0] + 15) >> 4;
 417         tlen = (mlen << 1);
 418         if (tlen < len) {
 419                 tlen = len;
 420         }
 421         tlen += 4;
 422         r = 0;
 423         for (pass = 0; pass < 2; pass ++) {
 424                 size_t u, v;
 425                 uint32_t acc;
 426                 int acc_len;
 427
 428                 v = 1;
 429                 acc = 0;
 430                 acc_len = 0;
 431                 for (u = 0; u < tlen; u ++) {
 432                         uint32_t b;
 433
 434                         if (u < len) {
 435                                 b = buf[len - 1 - u];
 436                         } else {
 437                                 b = 0;
 438                         }
 439                         acc |= (b << acc_len);
 440                         acc_len += 8;
 441                         if (acc_len >= 15) {
 442                                 uint32_t xw;
 443
 444                                 xw = acc & (uint32_t)0x7FFF;
 445                                 acc_len -= 15;
 446                                 acc = b >> (8 - acc_len);
 447                                 if (v <= mlen) {
 448                                         if (pass) {
 449                                                 x[v] = r & xw;
 450                                         } else {
 451                                                 uint32_t cc;
 452
 453                                                 cc = (uint32_t)CMP(xw, m[v]);
 454                                                 r = MUX(EQ(cc, 0), r, cc);
 455                                         }
 456                                 } else {
 457                                         if (!pass) {
 458                                                 r = MUX(EQ(xw, 0), r, 1);
 459                                         }
 460                                 }
 461                                 v ++;
 462                         }
 463                 }
 464
 465                 /*
 466                  * When we reach this point at the end of the first pass:
 467                  * r is either 0, 1 or -1; we want to set r to 0 if it
 468                  * is equal to 0 or 1, and leave it to -1 otherwise.
 469                  *
 470                  * When we reach this point at the end of the second pass:
 471                  * r is either 0 or -1; we want to leave that value
 472                  * untouched. This is a subcase of the previous.
 473                  */
 474                 r >>= 1;
 475                 r |= (r << 1);
 476         }
 477
 478         x[0] = m[0];
 479         return r & (uint32_t)1;
 480 }