_ Git - BearSSL/blob - src/inner.h

   1 /*
   2  * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining
   5  * a copy of this software and associated documentation files (the
   6  * "Software"), to deal in the Software without restriction, including
   7  * without limitation the rights to use, copy, modify, merge, publish,
   8  * distribute, sublicense, and/or sell copies of the Software, and to
   9  * permit persons to whom the Software is furnished to do so, subject to
  10  * the following conditions:
  11  *
  12  * The above copyright notice and this permission notice shall be
  13  * included in all copies or substantial portions of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  16  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  17  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  18  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  19  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  20  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  21  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24
  25 #ifndef INNER_H__
  26 #define INNER_H__
  27
  28 #include <string.h>
  29 #include <limits.h>
  30
  31 #include "config.h"
  32 #include "bearssl.h"
  33
  34 /*
  35  * On MSVC, disable the warning about applying unary minus on an
  36  * unsigned type: it is standard, we do it all the time, and for
  37  * good reasons.
  38  */
  39 #if _MSC_VER
  40 #pragma warning( disable : 4146 )
  41 #endif
  42
  43 /*
  44  * Maximum size for a RSA modulus (in bits). Allocated stack buffers
  45  * depend on that size, so this value should be kept small. Currently,
  46  * 2048-bit RSA keys offer adequate security, and should still do so for
  47  * the next few decades; however, a number of widespread PKI have
  48  * already set their root keys to RSA-4096, so we should be able to
  49  * process such keys.
  50  *
  51  * This value MUST be a multiple of 64.
  52  */
  53 #define BR_MAX_RSA_SIZE   4096
  54
  55 /*
  56  * Maximum size for a RSA factor (in bits). This is for RSA private-key
  57  * operations. Default is to support factors up to a bit more than half
  58  * the maximum modulus size.
  59  *
  60  * This value MUST be a multiple of 32.
  61  */
  62 #define BR_MAX_RSA_FACTOR   ((BR_MAX_RSA_SIZE + 64) >> 1)
  63
  64 /*
  65  * Maximum size for an EC curve (modulus or order), in bits. Size of
  66  * stack buffers depends on that parameter. This size MUST be a multiple
  67  * of 8 (so that decoding an integer with that many bytes does not
  68  * overflow).
  69  */
  70 #define BR_MAX_EC_SIZE   528
  71
  72 /*
  73  * Some macros to recognize the current architecture. Right now, we are
  74  * interested into automatically recognizing architecture with efficient
  75  * 64-bit types so that we may automatically use implementations that
  76  * use 64-bit registers in that case. Future versions may detect, e.g.,
  77  * availability of SSE2 intrinsics.
  78  *
  79  * If 'unsigned long' is a 64-bit type, then we assume that 64-bit types
  80  * are efficient. Otherwise, we rely on macros that depend on compiler,
  81  * OS and architecture. In any case, failure to detect the architecture
  82  * as 64-bit means that the 32-bit code will be used, and that code
  83  * works also on 64-bit architectures (the 64-bit code may simply be
  84  * more efficient).
  85  *
  86  * The test on 'unsigned long' should already catch most cases, the one
  87  * notable exception being Windows code where 'unsigned long' is kept to
  88  * 32-bit for compatbility with all the legacy code that liberally uses
  89  * the 'DWORD' type for 32-bit values.
  90  *
  91  * Macro names are taken from: http://nadeausoftware.com/articles/2012/02/c_c_tip_how_detect_processor_type_using_compiler_predefined_macros
  92  */
  93 #ifndef BR_64
  94 #if ((ULONG_MAX >> 31) >> 31) == 3
  95 #define BR_64   1
  96 #elif defined(__ia64) || defined(__itanium__) || defined(_M_IA64)
  97 #define BR_64   1
  98 #elif defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) \
  99         || defined(__64BIT__) || defined(_LP64) || defined(__LP64__)
 100 #define BR_64   1
 101 #elif defined(__sparc64__)
 102 #define BR_64   1
 103 #elif defined(__x86_64__) || defined(_M_X64)
 104 #define BR_64   1
 105 #endif
 106 #endif
 107
 108 /*
 109  * Set BR_LOMUL on platforms where it makes sense.
 110  */
 111 #ifndef BR_LOMUL
 112 #if BR_ARMEL_CORTEXM_GCC
 113 #define BR_LOMUL   1
 114 #endif
 115 #endif
 116
 117 /*
 118  * Architecture detection.
 119  */
 120 #ifndef BR_i386
 121 #if __i386__ || _M_IX86
 122 #define BR_i386   1
 123 #endif
 124 #endif
 125
 126 #ifndef BR_amd64
 127 #if __x86_64__ || _M_X64
 128 #define BR_amd64   1
 129 #endif
 130 #endif
 131
 132 /*
 133  * Compiler brand and version.
 134  *
 135  * Implementations that use intrinsics need to detect the compiler type
 136  * and version because some specific actions may be needed to activate
 137  * the corresponding opcodes, both for header inclusion, and when using
 138  * them in a function.
 139  *
 140  * BR_GCC, BR_CLANG and BR_MSC will be set to 1 for, respectively, GCC,
 141  * Clang and MS Visual C. For each of them, sub-macros will be defined
 142  * for versions; each sub-macro is set whenever the compiler version is
 143  * at least as recent as the one corresponding to the macro.
 144  */
 145
 146 /*
 147  * GCC thresholds are on versions 4.4 to 4.9 and 5.0.
 148  */
 149 #ifndef BR_GCC
 150 #if __GNUC__ && !__clang__
 151 #define BR_GCC   1
 152
 153 #if __GNUC__ > 4
 154 #define BR_GCC_5_0   1
 155 #elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9
 156 #define BR_GCC_4_9   1
 157 #elif __GNUC__ == 4 && __GNUC_MINOR__ >= 8
 158 #define BR_GCC_4_8   1
 159 #elif __GNUC__ == 4 && __GNUC_MINOR__ >= 7
 160 #define BR_GCC_4_7   1
 161 #elif __GNUC__ == 4 && __GNUC_MINOR__ >= 6
 162 #define BR_GCC_4_6   1
 163 #elif __GNUC__ == 4 && __GNUC_MINOR__ >= 5
 164 #define BR_GCC_4_5   1
 165 #elif __GNUC__ == 4 && __GNUC_MINOR__ >= 4
 166 #define BR_GCC_4_4   1
 167 #endif
 168
 169 #if BR_GCC_5_0
 170 #define BR_GCC_4_9   1
 171 #endif
 172 #if BR_GCC_4_9
 173 #define BR_GCC_4_8   1
 174 #endif
 175 #if BR_GCC_4_8
 176 #define BR_GCC_4_7   1
 177 #endif
 178 #if BR_GCC_4_7
 179 #define BR_GCC_4_6   1
 180 #endif
 181 #if BR_GCC_4_6
 182 #define BR_GCC_4_5   1
 183 #endif
 184 #if BR_GCC_4_5
 185 #define BR_GCC_4_4   1
 186 #endif
 187
 188 #endif
 189 #endif
 190
 191 /*
 192  * Clang thresholds are on versions 3.7.0 and 3.8.0.
 193  */
 194 #ifndef BR_CLANG
 195 #if __clang__
 196 #define BR_CLANG   1
 197
 198 #if __clang_major__ > 3 || (__clang_major__ == 3 && __clang_minor__ >= 8)
 199 #define BR_CLANG_3_8   1
 200 #elif __clang_major__ == 3 && __clang_minor__ >= 7
 201 #define BR_CLANG_3_7   1
 202 #endif
 203
 204 #if BR_CLANG_3_8
 205 #define BR_CLANG_3_7   1
 206 #endif
 207
 208 #endif
 209 #endif
 210
 211 /*
 212  * MS Visual C thresholds are on Visual Studio 2005 to 2015.
 213  */
 214 #ifndef BR_MSC
 215 #if _MSC_VER
 216 #define BR_MSC   1
 217
 218 #if _MSC_VER >= 1900
 219 #define BR_MSC_2015   1
 220 #elif _MSC_VER >= 1800
 221 #define BR_MSC_2013   1
 222 #elif _MSC_VER >= 1700
 223 #define BR_MSC_2012   1
 224 #elif _MSC_VER >= 1600
 225 #define BR_MSC_2010   1
 226 #elif _MSC_VER >= 1500
 227 #define BR_MSC_2008   1
 228 #elif _MSC_VER >= 1400
 229 #define BR_MSC_2005   1
 230 #endif
 231
 232 #if BR_MSC_2015
 233 #define BR_MSC_2013   1
 234 #endif
 235 #if BR_MSC_2013
 236 #define BR_MSC_2012   1
 237 #endif
 238 #if BR_MSC_2012
 239 #define BR_MSC_2010   1
 240 #endif
 241 #if BR_MSC_2010
 242 #define BR_MSC_2008   1
 243 #endif
 244 #if BR_MSC_2008
 245 #define BR_MSC_2005   1
 246 #endif
 247
 248 #endif
 249 #endif
 250
 251 /*
 252  * GCC 4.4+ and Clang 3.7+ allow tagging specific functions with a
 253  * 'target' attribute that activates support for specific opcodes.
 254  */
 255 #if BR_GCC_4_4 || BR_CLANG_3_7
 256 #define BR_TARGET(x)   __attribute__((target(x)))
 257 #else
 258 #define BR_TARGET(x)
 259 #endif
 260
 261 /*
 262  * AES-NI intrinsics are available on x86 (32-bit and 64-bit) with
 263  * GCC 4.8+, Clang 3.7+ and MSC 2012+.
 264  */
 265 #ifndef BR_AES_X86NI
 266 #if (BR_i386 || BR_amd64) && (BR_GCC_4_8 || BR_CLANG_3_7 || BR_MSC_2012)
 267 #define BR_AES_X86NI   1
 268 #endif
 269 #endif
 270
 271 /*
 272  * SSE2 intrinsics are available on x86 (32-bit and 64-bit) with
 273  * GCC 4.4+, Clang 3.7+ and MSC 2005+.
 274  */
 275 #ifndef BR_SSE2
 276 #if (BR_i386 || BR_amd64) && (BR_GCC_4_4 || BR_CLANG_3_7 || BR_MSC_2005)
 277 #define BR_SSE2   1
 278 #endif
 279 #endif
 280
 281 /*
 282  * RDRAND intrinsics are available on x86 (32-bit and 64-bit) with
 283  * GCC 4.6+, Clang 3.7+ and MSC 2012+.
 284  */
 285 #ifndef BR_RDRAND
 286 #if (BR_i386 || BR_amd64) && (BR_GCC_4_6 || BR_CLANG_3_7 || BR_MSC_2012)
 287 #define BR_RDRAND   1
 288 #endif
 289 #endif
 290
 291 /*
 292  * Determine type of OS for random number generation. Macro names and
 293  * values are documented on:
 294  *    https://sourceforge.net/p/predef/wiki/OperatingSystems/
 295  *
 296  * TODO: enrich the list of detected system. Also add detection for
 297  * alternate system calls like getentropy(), which are usually
 298  * preferable when available.
 299  */
 300
 301 #ifndef BR_USE_URANDOM
 302 #if defined _AIX \
 303         || defined __ANDROID__ \
 304         || defined __FreeBSD__ \
 305         || defined __NetBSD__ \
 306         || defined __OpenBSD__ \
 307         || defined __DragonFly__ \
 308         || defined __linux__ \
 309         || (defined __sun && (defined __SVR4 || defined __svr4__)) \
 310         || (defined __APPLE__ && defined __MACH__)
 311 #define BR_USE_URANDOM   1
 312 #endif
 313 #endif
 314
 315 #ifndef BR_USE_WIN32_RAND
 316 #if defined _WIN32 || defined _WIN64
 317 #define BR_USE_WIN32_RAND   1
 318 #endif
 319 #endif
 320
 321 /*
 322  * POWER8 crypto support. We rely on compiler macros for the
 323  * architecture, since we do not have a reliable, simple way to detect
 324  * the required support at runtime (we could try running an opcode, and
 325  * trapping the exception or signal on illegal instruction, but this
 326  * induces some non-trivial OS dependencies that we would prefer to
 327  * avoid if possible).
 328  */
 329 #ifndef BR_POWER8
 330 #if __GNUC__ && ((_ARCH_PWR8 || _ARCH_PPC) && __CRYPTO__)
 331 #define BR_POWER8   1
 332 #endif
 333 #endif
 334
 335 /*
 336  * Detect endinanness on POWER8.
 337  */
 338 #if BR_POWER8
 339 #if defined BR_POWER8_LE
 340 #undef BR_POWER8_BE
 341 #if BR_POWER8_LE
 342 #define BR_POWER8_BE   0
 343 #else
 344 #define BR_POWER8_BE   1
 345 #endif
 346 #elif defined BR_POWER8_BE
 347 #undef BR_POWER8_LE
 348 #if BR_POWER8_BE
 349 #define BR_POWER8_LE   0
 350 #else
 351 #define BR_POWER8_LE   1
 352 #endif
 353 #else
 354 #if __LITTLE_ENDIAN__
 355 #define BR_POWER8_LE   1
 356 #define BR_POWER8_BE   0
 357 #else
 358 #define BR_POWER8_LE   0
 359 #define BR_POWER8_BE   1
 360 #endif
 361 #endif
 362 #endif
 363
 364 /*
 365  * Detect support for 128-bit integers.
 366  */
 367 #if !defined BR_INT128 && !defined BR_UMUL128
 368 #ifdef __SIZEOF_INT128__
 369 #define BR_INT128    1
 370 #elif _M_X64
 371 #define BR_UMUL128   1
 372 #endif
 373 #endif
 374
 375 /*
 376  * Detect support for unaligned accesses with known endianness.
 377  *
 378  *  x86 (both 32-bit and 64-bit) is little-endian and allows unaligned
 379  *  accesses.
 380  *
 381  *  POWER/PowerPC allows unaligned accesses when big-endian. POWER8 and
 382  *  later also allow unaligned accesses when little-endian.
 383  */
 384 #if !defined BR_LE_UNALIGNED && !defined BR_BE_UNALIGNED
 385
 386 #if __i386 || __i386__ || __x86_64__ || _M_IX86 || _M_X64
 387 #define BR_LE_UNALIGNED   1
 388 #elif BR_POWER8_BE
 389 #define BR_BE_UNALIGNED   1
 390 #elif BR_POWER8_LE
 391 #define BR_LE_UNALIGNED   1
 392 #elif (__powerpc__ || __powerpc64__ || _M_PPC || _ARCH_PPC || _ARCH_PPC64) \
 393         && __BIG_ENDIAN__
 394 #define BR_BE_UNALIGNED   1
 395 #endif
 396
 397 #endif
 398
 399 /*
 400  * Detect support for an OS-provided time source.
 401  */
 402
 403 #ifndef BR_USE_UNIX_TIME
 404 #if defined __unix__ || defined __linux__ \
 405         || defined _POSIX_SOURCE || defined _POSIX_C_SOURCE \
 406         || (defined __APPLE__ && defined __MACH__)
 407 #define BR_USE_UNIX_TIME   1
 408 #endif
 409 #endif
 410
 411 #ifndef BR_USE_WIN32_TIME
 412 #if defined _WIN32 || defined _WIN64
 413 #define BR_USE_WIN32_TIME   1
 414 #endif
 415 #endif
 416
 417 /* ==================================================================== */
 418 /*
 419  * Encoding/decoding functions.
 420  *
 421  * 32-bit and 64-bit decoding, both little-endian and big-endian, is
 422  * implemented with the inline functions below.
 423  *
 424  * When allowed by some compile-time options (autodetected or provided),
 425  * optimised code is used, to perform direct memory access when the
 426  * underlying architecture supports it, both for endianness and
 427  * alignment. This, however, may trigger strict aliasing issues; the
 428  * code below uses unions to perform (supposedly) safe type punning.
 429  * Since the C aliasing rules are relatively complex and were amended,
 430  * or at least re-explained with different phrasing, in all successive
 431  * versions of the C standard, it is always a bit risky to bet that any
 432  * specific version of a C compiler got it right, for some notion of
 433  * "right".
 434  */
 435
 436 typedef union {
 437         uint16_t u;
 438         unsigned char b[sizeof(uint16_t)];
 439 } br_union_u16;
 440
 441 typedef union {
 442         uint32_t u;
 443         unsigned char b[sizeof(uint32_t)];
 444 } br_union_u32;
 445
 446 typedef union {
 447         uint64_t u;
 448         unsigned char b[sizeof(uint64_t)];
 449 } br_union_u64;
 450
 451 static inline void
 452 br_enc16le(void *dst, unsigned x)
 453 {
 454 #if BR_LE_UNALIGNED
 455         ((br_union_u16 *)dst)->u = x;
 456 #else
 457         unsigned char *buf;
 458
 459         buf = dst;
 460         buf[0] = (unsigned char)x;
 461         buf[1] = (unsigned char)(x >> 8);
 462 #endif
 463 }
 464
 465 static inline void
 466 br_enc16be(void *dst, unsigned x)
 467 {
 468 #if BR_BE_UNALIGNED
 469         ((br_union_u16 *)dst)->u = x;
 470 #else
 471         unsigned char *buf;
 472
 473         buf = dst;
 474         buf[0] = (unsigned char)(x >> 8);
 475         buf[1] = (unsigned char)x;
 476 #endif
 477 }
 478
 479 static inline unsigned
 480 br_dec16le(const void *src)
 481 {
 482 #if BR_LE_UNALIGNED
 483         return ((const br_union_u16 *)src)->u;
 484 #else
 485         const unsigned char *buf;
 486
 487         buf = src;
 488         return (unsigned)buf[0] | ((unsigned)buf[1] << 8);
 489 #endif
 490 }
 491
 492 static inline unsigned
 493 br_dec16be(const void *src)
 494 {
 495 #if BR_BE_UNALIGNED
 496         return ((const br_union_u16 *)src)->u;
 497 #else
 498         const unsigned char *buf;
 499
 500         buf = src;
 501         return ((unsigned)buf[0] << 8) | (unsigned)buf[1];
 502 #endif
 503 }
 504
 505 static inline void
 506 br_enc32le(void *dst, uint32_t x)
 507 {
 508 #if BR_LE_UNALIGNED
 509         ((br_union_u32 *)dst)->u = x;
 510 #else
 511         unsigned char *buf;
 512
 513         buf = dst;
 514         buf[0] = (unsigned char)x;
 515         buf[1] = (unsigned char)(x >> 8);
 516         buf[2] = (unsigned char)(x >> 16);
 517         buf[3] = (unsigned char)(x >> 24);
 518 #endif
 519 }
 520
 521 static inline void
 522 br_enc32be(void *dst, uint32_t x)
 523 {
 524 #if BR_BE_UNALIGNED
 525         ((br_union_u32 *)dst)->u = x;
 526 #else
 527         unsigned char *buf;
 528
 529         buf = dst;
 530         buf[0] = (unsigned char)(x >> 24);
 531         buf[1] = (unsigned char)(x >> 16);
 532         buf[2] = (unsigned char)(x >> 8);
 533         buf[3] = (unsigned char)x;
 534 #endif
 535 }
 536
 537 static inline uint32_t
 538 br_dec32le(const void *src)
 539 {
 540 #if BR_LE_UNALIGNED
 541         return ((const br_union_u32 *)src)->u;
 542 #else
 543         const unsigned char *buf;
 544
 545         buf = src;
 546         return (uint32_t)buf[0]
 547                 | ((uint32_t)buf[1] << 8)
 548                 | ((uint32_t)buf[2] << 16)
 549                 | ((uint32_t)buf[3] << 24);
 550 #endif
 551 }
 552
 553 static inline uint32_t
 554 br_dec32be(const void *src)
 555 {
 556 #if BR_BE_UNALIGNED
 557         return ((const br_union_u32 *)src)->u;
 558 #else
 559         const unsigned char *buf;
 560
 561         buf = src;
 562         return ((uint32_t)buf[0] << 24)
 563                 | ((uint32_t)buf[1] << 16)
 564                 | ((uint32_t)buf[2] << 8)
 565                 | (uint32_t)buf[3];
 566 #endif
 567 }
 568
 569 static inline void
 570 br_enc64le(void *dst, uint64_t x)
 571 {
 572 #if BR_LE_UNALIGNED
 573         ((br_union_u64 *)dst)->u = x;
 574 #else
 575         unsigned char *buf;
 576
 577         buf = dst;
 578         br_enc32le(buf, (uint32_t)x);
 579         br_enc32le(buf + 4, (uint32_t)(x >> 32));
 580 #endif
 581 }
 582
 583 static inline void
 584 br_enc64be(void *dst, uint64_t x)
 585 {
 586 #if BR_BE_UNALIGNED
 587         ((br_union_u64 *)dst)->u = x;
 588 #else
 589         unsigned char *buf;
 590
 591         buf = dst;
 592         br_enc32be(buf, (uint32_t)(x >> 32));
 593         br_enc32be(buf + 4, (uint32_t)x);
 594 #endif
 595 }
 596
 597 static inline uint64_t
 598 br_dec64le(const void *src)
 599 {
 600 #if BR_LE_UNALIGNED
 601         return ((const br_union_u64 *)src)->u;
 602 #else
 603         const unsigned char *buf;
 604
 605         buf = src;
 606         return (uint64_t)br_dec32le(buf)
 607                 | ((uint64_t)br_dec32le(buf + 4) << 32);
 608 #endif
 609 }
 610
 611 static inline uint64_t
 612 br_dec64be(const void *src)
 613 {
 614 #if BR_BE_UNALIGNED
 615         return ((const br_union_u64 *)src)->u;
 616 #else
 617         const unsigned char *buf;
 618
 619         buf = src;
 620         return ((uint64_t)br_dec32be(buf) << 32)
 621                 | (uint64_t)br_dec32be(buf + 4);
 622 #endif
 623 }
 624
 625 /*
 626  * Range decoding and encoding (for several successive values).
 627  */
 628 void br_range_dec16le(uint16_t *v, size_t num, const void *src);
 629 void br_range_dec16be(uint16_t *v, size_t num, const void *src);
 630 void br_range_enc16le(void *dst, const uint16_t *v, size_t num);
 631 void br_range_enc16be(void *dst, const uint16_t *v, size_t num);
 632
 633 void br_range_dec32le(uint32_t *v, size_t num, const void *src);
 634 void br_range_dec32be(uint32_t *v, size_t num, const void *src);
 635 void br_range_enc32le(void *dst, const uint32_t *v, size_t num);
 636 void br_range_enc32be(void *dst, const uint32_t *v, size_t num);
 637
 638 void br_range_dec64le(uint64_t *v, size_t num, const void *src);
 639 void br_range_dec64be(uint64_t *v, size_t num, const void *src);
 640 void br_range_enc64le(void *dst, const uint64_t *v, size_t num);
 641 void br_range_enc64be(void *dst, const uint64_t *v, size_t num);
 642
 643 /*
 644  * Byte-swap a 32-bit integer.
 645  */
 646 static inline uint32_t
 647 br_swap32(uint32_t x)
 648 {
 649         x = ((x & (uint32_t)0x00FF00FF) << 8)
 650                 | ((x >> 8) & (uint32_t)0x00FF00FF);
 651         return (x << 16) | (x >> 16);
 652 }
 653
 654 /* ==================================================================== */
 655 /*
 656  * Support code for hash functions.
 657  */
 658
 659 /*
 660  * IV for MD5, SHA-1, SHA-224 and SHA-256.
 661  */
 662 extern const uint32_t br_md5_IV[];
 663 extern const uint32_t br_sha1_IV[];
 664 extern const uint32_t br_sha224_IV[];
 665 extern const uint32_t br_sha256_IV[];
 666
 667 /*
 668  * Round functions for MD5, SHA-1, SHA-224 and SHA-256 (SHA-224 and
 669  * SHA-256 use the same round function).
 670  */
 671 void br_md5_round(const unsigned char *buf, uint32_t *val);
 672 void br_sha1_round(const unsigned char *buf, uint32_t *val);
 673 void br_sha2small_round(const unsigned char *buf, uint32_t *val);
 674
 675 /*
 676  * The core function for the TLS PRF. It computes
 677  * P_hash(secret, label + seed), and XORs the result into the dst buffer.
 678  */
 679 void br_tls_phash(void *dst, size_t len,
 680         const br_hash_class *dig,
 681         const void *secret, size_t secret_len, const char *label,
 682         size_t seed_num, const br_tls_prf_seed_chunk *seed);
 683
 684 /*
 685  * Copy all configured hash implementations from a multihash context
 686  * to another.
 687  */
 688 static inline void
 689 br_multihash_copyimpl(br_multihash_context *dst,
 690         const br_multihash_context *src)
 691 {
 692         memcpy((void *)dst->impl, src->impl, sizeof src->impl);
 693 }
 694
 695 /* ==================================================================== */
 696 /*
 697  * Constant-time primitives. These functions manipulate 32-bit values in
 698  * order to provide constant-time comparisons and multiplexers.
 699  *
 700  * Boolean values (the "ctl" bits) MUST have value 0 or 1.
 701  *
 702  * Implementation notes:
 703  * =====================
 704  *
 705  * The uintN_t types are unsigned and with width exactly N bits; the C
 706  * standard guarantees that computations are performed modulo 2^N, and
 707  * there can be no overflow. Negation (unary '-') works on unsigned types
 708  * as well.
 709  *
 710  * The intN_t types are guaranteed to have width exactly N bits, with no
 711  * padding bit, and using two's complement representation. Casting
 712  * intN_t to uintN_t really is conversion modulo 2^N. Beware that intN_t
 713  * types, being signed, trigger implementation-defined behaviour on
 714  * overflow (including raising some signal): with GCC, while modular
 715  * arithmetics are usually applied, the optimizer may assume that
 716  * overflows don't occur (unless the -fwrapv command-line option is
 717  * added); Clang has the additional -ftrapv option to explicitly trap on
 718  * integer overflow or underflow.
 719  */
 720
 721 /*
 722  * Negate a boolean.
 723  */
 724 static inline uint32_t
 725 NOT(uint32_t ctl)
 726 {
 727         return ctl ^ 1;
 728 }
 729
 730 /*
 731  * Multiplexer: returns x if ctl == 1, y if ctl == 0.
 732  */
 733 static inline uint32_t
 734 MUX(uint32_t ctl, uint32_t x, uint32_t y)
 735 {
 736         return y ^ (-ctl & (x ^ y));
 737 }
 738
 739 /*
 740  * Equality check: returns 1 if x == y, 0 otherwise.
 741  */
 742 static inline uint32_t
 743 EQ(uint32_t x, uint32_t y)
 744 {
 745         uint32_t q;
 746
 747         q = x ^ y;
 748         return NOT((q | -q) >> 31);
 749 }
 750
 751 /*
 752  * Inequality check: returns 1 if x != y, 0 otherwise.
 753  */
 754 static inline uint32_t
 755 NEQ(uint32_t x, uint32_t y)
 756 {
 757         uint32_t q;
 758
 759         q = x ^ y;
 760         return (q | -q) >> 31;
 761 }
 762
 763 /*
 764  * Comparison: returns 1 if x > y, 0 otherwise.
 765  */
 766 static inline uint32_t
 767 GT(uint32_t x, uint32_t y)
 768 {
 769         /*
 770          * If both x < 2^31 and x < 2^31, then y-x will have its high
 771          * bit set if x > y, cleared otherwise.
 772          *
 773          * If either x >= 2^31 or y >= 2^31 (but not both), then the
 774          * result is the high bit of x.
 775          *
 776          * If both x >= 2^31 and y >= 2^31, then we can virtually
 777          * subtract 2^31 from both, and we are back to the first case.
 778          * Since (y-2^31)-(x-2^31) = y-x, the subtraction is already
 779          * fine.
 780          */
 781         uint32_t z;
 782
 783         z = y - x;
 784         return (z ^ ((x ^ y) & (x ^ z))) >> 31;
 785 }
 786
 787 /*
 788  * Other comparisons (greater-or-equal, lower-than, lower-or-equal).
 789  */
 790 #define GE(x, y)   NOT(GT(y, x))
 791 #define LT(x, y)   GT(y, x)
 792 #define LE(x, y)   NOT(GT(x, y))
 793
 794 /*
 795  * General comparison: returned value is -1, 0 or 1, depending on
 796  * whether x is lower than, equal to, or greater than y.
 797  */
 798 static inline int32_t
 799 CMP(uint32_t x, uint32_t y)
 800 {
 801         return (int32_t)GT(x, y) | -(int32_t)GT(y, x);
 802 }
 803
 804 /*
 805  * Returns 1 if x == 0, 0 otherwise. Take care that the operand is signed.
 806  */
 807 static inline uint32_t
 808 EQ0(int32_t x)
 809 {
 810         uint32_t q;
 811
 812         q = (uint32_t)x;
 813         return ~(q | -q) >> 31;
 814 }
 815
 816 /*
 817  * Returns 1 if x > 0, 0 otherwise. Take care that the operand is signed.
 818  */
 819 static inline uint32_t
 820 GT0(int32_t x)
 821 {
 822         /*
 823          * High bit of -x is 0 if x == 0, but 1 if x > 0.
 824          */
 825         uint32_t q;
 826
 827         q = (uint32_t)x;
 828         return (~q & -q) >> 31;
 829 }
 830
 831 /*
 832  * Returns 1 if x >= 0, 0 otherwise. Take care that the operand is signed.
 833  */
 834 static inline uint32_t
 835 GE0(int32_t x)
 836 {
 837         return ~(uint32_t)x >> 31;
 838 }
 839
 840 /*
 841  * Returns 1 if x < 0, 0 otherwise. Take care that the operand is signed.
 842  */
 843 static inline uint32_t
 844 LT0(int32_t x)
 845 {
 846         return (uint32_t)x >> 31;
 847 }
 848
 849 /*
 850  * Returns 1 if x <= 0, 0 otherwise. Take care that the operand is signed.
 851  */
 852 static inline uint32_t
 853 LE0(int32_t x)
 854 {
 855         uint32_t q;
 856
 857         /*
 858          * ~-x has its high bit set if and only if -x is nonnegative (as
 859          * a signed int), i.e. x is in the -(2^31-1) to 0 range. We must
 860          * do an OR with x itself to account for x = -2^31.
 861          */
 862         q = (uint32_t)x;
 863         return (q | ~-q) >> 31;
 864 }
 865
 866 /*
 867  * Conditional copy: src[] is copied into dst[] if and only if ctl is 1.
 868  * dst[] and src[] may overlap completely (but not partially).
 869  */
 870 void br_ccopy(uint32_t ctl, void *dst, const void *src, size_t len);
 871
 872 #define CCOPY   br_ccopy
 873
 874 /*
 875  * Compute the bit length of a 32-bit integer. Returned value is between 0
 876  * and 32 (inclusive).
 877  */
 878 static inline uint32_t
 879 BIT_LENGTH(uint32_t x)
 880 {
 881         uint32_t k, c;
 882
 883         k = NEQ(x, 0);
 884         c = GT(x, 0xFFFF); x = MUX(c, x >> 16, x); k += c << 4;
 885         c = GT(x, 0x00FF); x = MUX(c, x >>  8, x); k += c << 3;
 886         c = GT(x, 0x000F); x = MUX(c, x >>  4, x); k += c << 2;
 887         c = GT(x, 0x0003); x = MUX(c, x >>  2, x); k += c << 1;
 888         k += GT(x, 0x0001);
 889         return k;
 890 }
 891
 892 /*
 893  * Compute the minimum of x and y.
 894  */
 895 static inline uint32_t
 896 MIN(uint32_t x, uint32_t y)
 897 {
 898         return MUX(GT(x, y), y, x);
 899 }
 900
 901 /*
 902  * Compute the maximum of x and y.
 903  */
 904 static inline uint32_t
 905 MAX(uint32_t x, uint32_t y)
 906 {
 907         return MUX(GT(x, y), x, y);
 908 }
 909
 910 /*
 911  * Multiply two 32-bit integers, with a 64-bit result. This default
 912  * implementation assumes that the basic multiplication operator
 913  * yields constant-time code.
 914  */
 915 #define MUL(x, y)   ((uint64_t)(x) * (uint64_t)(y))
 916
 917 #if BR_CT_MUL31
 918
 919 /*
 920  * Alternate implementation of MUL31, that will be constant-time on some
 921  * (old) platforms where the default MUL31 is not. Unfortunately, it is
 922  * also substantially slower, and yields larger code, on more modern
 923  * platforms, which is why it is deactivated by default.
 924  *
 925  * MUL31_lo() must do some extra work because on some platforms, the
 926  * _signed_ multiplication may return early if the top bits are 1.
 927  * Simply truncating (casting) the output of MUL31() would not be
 928  * sufficient, because the compiler may notice that we keep only the low
 929  * word, and then replace automatically the unsigned multiplication with
 930  * a signed multiplication opcode.
 931  */
 932 #define MUL31(x, y)   ((uint64_t)((x) | (uint32_t)0x80000000) \
 933                        * (uint64_t)((y) | (uint32_t)0x80000000) \
 934                        - ((uint64_t)(x) << 31) - ((uint64_t)(y) << 31) \
 935                        - ((uint64_t)1 << 62))
 936 static inline uint32_t
 937 MUL31_lo(uint32_t x, uint32_t y)
 938 {
 939         uint32_t xl, xh;
 940         uint32_t yl, yh;
 941
 942         xl = (x & 0xFFFF) | (uint32_t)0x80000000;
 943         xh = (x >> 16) | (uint32_t)0x80000000;
 944         yl = (y & 0xFFFF) | (uint32_t)0x80000000;
 945         yh = (y >> 16) | (uint32_t)0x80000000;
 946         return (xl * yl + ((xl * yh + xh * yl) << 16)) & (uint32_t)0x7FFFFFFF;
 947 }
 948
 949 #else
 950
 951 /*
 952  * Multiply two 31-bit integers, with a 62-bit result. This default
 953  * implementation assumes that the basic multiplication operator
 954  * yields constant-time code.
 955  * The MUL31_lo() macro returns only the low 31 bits of the product.
 956  */
 957 #define MUL31(x, y)     ((uint64_t)(x) * (uint64_t)(y))
 958 #define MUL31_lo(x, y)  (((uint32_t)(x) * (uint32_t)(y)) & (uint32_t)0x7FFFFFFF)
 959
 960 #endif
 961
 962 /*
 963  * Multiply two words together; the sum of the lengths of the two
 964  * operands must not exceed 31 (for instance, one operand may use 16
 965  * bits if the other fits on 15). If BR_CT_MUL15 is non-zero, then the
 966  * macro will contain some extra operations that help in making the
 967  * operation constant-time on some platforms, where the basic 32-bit
 968  * multiplication is not constant-time.
 969  */
 970 #if BR_CT_MUL15
 971 #define MUL15(x, y)   (((uint32_t)(x) | (uint32_t)0x80000000) \
 972                        * ((uint32_t)(y) | (uint32_t)0x80000000) \
 973                        & (uint32_t)0x7FFFFFFF)
 974 #else
 975 #define MUL15(x, y)   ((uint32_t)(x) * (uint32_t)(y))
 976 #endif
 977
 978 /*
 979  * Arithmetic right shift (sign bit is copied). What happens when
 980  * right-shifting a negative value is _implementation-defined_, so it
 981  * does not trigger undefined behaviour, but it is still up to each
 982  * compiler to define (and document) what it does. Most/all compilers
 983  * will do an arithmetic shift, the sign bit being used to fill the
 984  * holes; this is a native operation on the underlying CPU, and it would
 985  * make little sense for the compiler to do otherwise. GCC explicitly
 986  * documents that it follows that convention.
 987  *
 988  * Still, if BR_NO_ARITH_SHIFT is defined (and non-zero), then an
 989  * alternate version will be used, that does not rely on such
 990  * implementation-defined behaviour. Unfortunately, it is also slower
 991  * and yields bigger code, which is why it is deactivated by default.
 992  */
 993 #if BR_NO_ARITH_SHIFT
 994 #define ARSH(x, n)   (((uint32_t)(x) >> (n)) \
 995                       | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
 996 #else
 997 #define ARSH(x, n)   ((*(int32_t *)&(x)) >> (n))
 998 #endif
 999
1000 /*
1001  * Constant-time division. The dividend hi:lo is divided by the
1002  * divisor d; the quotient is returned and the remainder is written
1003  * in *r. If hi == d, then the quotient does not fit on 32 bits;
1004  * returned value is thus truncated. If hi > d, returned values are
1005  * indeterminate.
1006  */
1007 uint32_t br_divrem(uint32_t hi, uint32_t lo, uint32_t d, uint32_t *r);
1008
1009 /*
1010  * Wrapper for br_divrem(); the remainder is returned, and the quotient
1011  * is discarded.
1012  */
1013 static inline uint32_t
1014 br_rem(uint32_t hi, uint32_t lo, uint32_t d)
1015 {
1016         uint32_t r;
1017
1018         br_divrem(hi, lo, d, &r);
1019         return r;
1020 }
1021
1022 /*
1023  * Wrapper for br_divrem(); the quotient is returned, and the remainder
1024  * is discarded.
1025  */
1026 static inline uint32_t
1027 br_div(uint32_t hi, uint32_t lo, uint32_t d)
1028 {
1029         uint32_t r;
1030
1031         return br_divrem(hi, lo, d, &r);
1032 }
1033
1034 /* ==================================================================== */
1035
1036 /*
1037  * Integers 'i32'
1038  * --------------
1039  *
1040  * The 'i32' functions implement computations on big integers using
1041  * an internal representation as an array of 32-bit integers. For
1042  * an array x[]:
1043  *  -- x[0] contains the "announced bit length" of the integer
1044  *  -- x[1], x[2]... contain the value in little-endian order (x[1]
1045  *     contains the least significant 32 bits)
1046  *
1047  * Multiplications rely on the elementary 32x32->64 multiplication.
1048  *
1049  * The announced bit length specifies the number of bits that are
1050  * significant in the subsequent 32-bit words. Unused bits in the
1051  * last (most significant) word are set to 0; subsequent words are
1052  * uninitialized and need not exist at all.
1053  *
1054  * The execution time and memory access patterns of all computations
1055  * depend on the announced bit length, but not on the actual word
1056  * values. For modular integers, the announced bit length of any integer
1057  * modulo n is equal to the actual bit length of n; thus, computations
1058  * on modular integers are "constant-time" (only the modulus length may
1059  * leak).
1060  */
1061
1062 /*
1063  * Compute the actual bit length of an integer. The argument x should
1064  * point to the first (least significant) value word of the integer.
1065  * The len 'xlen' contains the number of 32-bit words to access.
1066  *
1067  * CT: value or length of x does not leak.
1068  */
1069 uint32_t br_i32_bit_length(uint32_t *x, size_t xlen);
1070
1071 /*
1072  * Decode an integer from its big-endian unsigned representation. The
1073  * "true" bit length of the integer is computed, but all words of x[]
1074  * corresponding to the full 'len' bytes of the source are set.
1075  *
1076  * CT: value or length of x does not leak.
1077  */
1078 void br_i32_decode(uint32_t *x, const void *src, size_t len);
1079
1080 /*
1081  * Decode an integer from its big-endian unsigned representation. The
1082  * integer MUST be lower than m[]; the announced bit length written in
1083  * x[] will be equal to that of m[]. All 'len' bytes from the source are
1084  * read.
1085  *
1086  * Returned value is 1 if the decode value fits within the modulus, 0
1087  * otherwise. In the latter case, the x[] buffer will be set to 0 (but
1088  * still with the announced bit length of m[]).
1089  *
1090  * CT: value or length of x does not leak. Memory access pattern depends
1091  * only of 'len' and the announced bit length of m. Whether x fits or
1092  * not does not leak either.
1093  */
1094 uint32_t br_i32_decode_mod(uint32_t *x,
1095         const void *src, size_t len, const uint32_t *m);
1096
1097 /*
1098  * Reduce an integer (a[]) modulo another (m[]). The result is written
1099  * in x[] and its announced bit length is set to be equal to that of m[].
1100  *
1101  * x[] MUST be distinct from a[] and m[].
1102  *
1103  * CT: only announced bit lengths leak, not values of x, a or m.
1104  */
1105 void br_i32_reduce(uint32_t *x, const uint32_t *a, const uint32_t *m);
1106
1107 /*
1108  * Decode an integer from its big-endian unsigned representation, and
1109  * reduce it modulo the provided modulus m[]. The announced bit length
1110  * of the result is set to be equal to that of the modulus.
1111  *
1112  * x[] MUST be distinct from m[].
1113  */
1114 void br_i32_decode_reduce(uint32_t *x,
1115         const void *src, size_t len, const uint32_t *m);
1116
1117 /*
1118  * Encode an integer into its big-endian unsigned representation. The
1119  * output length in bytes is provided (parameter 'len'); if the length
1120  * is too short then the integer is appropriately truncated; if it is
1121  * too long then the extra bytes are set to 0.
1122  */
1123 void br_i32_encode(void *dst, size_t len, const uint32_t *x);
1124
1125 /*
1126  * Multiply x[] by 2^32 and then add integer z, modulo m[]. This
1127  * function assumes that x[] and m[] have the same announced bit
1128  * length, and the announced bit length of m[] matches its true
1129  * bit length.
1130  *
1131  * x[] and m[] MUST be distinct arrays.
1132  *
1133  * CT: only the common announced bit length of x and m leaks, not
1134  * the values of x, z or m.
1135  */
1136 void br_i32_muladd_small(uint32_t *x, uint32_t z, const uint32_t *m);
1137
1138 /*
1139  * Extract one word from an integer. The offset is counted in bits.
1140  * The word MUST entirely fit within the word elements corresponding
1141  * to the announced bit length of a[].
1142  */
1143 static inline uint32_t
1144 br_i32_word(const uint32_t *a, uint32_t off)
1145 {
1146         size_t u;
1147         unsigned j;
1148
1149         u = (size_t)(off >> 5) + 1;
1150         j = (unsigned)off & 31;
1151         if (j == 0) {
1152                 return a[u];
1153         } else {
1154                 return (a[u] >> j) | (a[u + 1] << (32 - j));
1155         }
1156 }
1157
1158 /*
1159  * Test whether an integer is zero.
1160  */
1161 uint32_t br_i32_iszero(const uint32_t *x);
1162
1163 /*
1164  * Add b[] to a[] and return the carry (0 or 1). If ctl is 0, then a[]
1165  * is unmodified, but the carry is still computed and returned. The
1166  * arrays a[] and b[] MUST have the same announced bit length.
1167  *
1168  * a[] and b[] MAY be the same array, but partial overlap is not allowed.
1169  */
1170 uint32_t br_i32_add(uint32_t *a, const uint32_t *b, uint32_t ctl);
1171
1172 /*
1173  * Subtract b[] from a[] and return the carry (0 or 1). If ctl is 0,
1174  * then a[] is unmodified, but the carry is still computed and returned.
1175  * The arrays a[] and b[] MUST have the same announced bit length.
1176  *
1177  * a[] and b[] MAY be the same array, but partial overlap is not allowed.
1178  */
1179 uint32_t br_i32_sub(uint32_t *a, const uint32_t *b, uint32_t ctl);
1180
1181 /*
1182  * Compute d+a*b, result in d. The initial announced bit length of d[]
1183  * MUST match that of a[]. The d[] array MUST be large enough to
1184  * accommodate the full result, plus (possibly) an extra word. The
1185  * resulting announced bit length of d[] will be the sum of the announced
1186  * bit lengths of a[] and b[] (therefore, it may be larger than the actual
1187  * bit length of the numerical result).
1188  *
1189  * a[] and b[] may be the same array. d[] must be disjoint from both a[]
1190  * and b[].
1191  */
1192 void br_i32_mulacc(uint32_t *d, const uint32_t *a, const uint32_t *b);
1193
1194 /*
1195  * Zeroize an integer. The announced bit length is set to the provided
1196  * value, and the corresponding words are set to 0.
1197  */
1198 static inline void
1199 br_i32_zero(uint32_t *x, uint32_t bit_len)
1200 {
1201         *x ++ = bit_len;
1202         memset(x, 0, ((bit_len + 31) >> 5) * sizeof *x);
1203 }
1204
1205 /*
1206  * Compute -(1/x) mod 2^32. If x is even, then this function returns 0.
1207  */
1208 uint32_t br_i32_ninv32(uint32_t x);
1209
1210 /*
1211  * Convert a modular integer to Montgomery representation. The integer x[]
1212  * MUST be lower than m[], but with the same announced bit length.
1213  */
1214 void br_i32_to_monty(uint32_t *x, const uint32_t *m);
1215
1216 /*
1217  * Convert a modular integer back from Montgomery representation. The
1218  * integer x[] MUST be lower than m[], but with the same announced bit
1219  * length. The "m0i" parameter is equal to -(1/m0) mod 2^32, where m0 is
1220  * the least significant value word of m[] (this works only if m[] is
1221  * an odd integer).
1222  */
1223 void br_i32_from_monty(uint32_t *x, const uint32_t *m, uint32_t m0i);
1224
1225 /*
1226  * Compute a modular Montgomery multiplication. d[] is filled with the
1227  * value of x*y/R modulo m[] (where R is the Montgomery factor). The
1228  * array d[] MUST be distinct from x[], y[] and m[]. x[] and y[] MUST be
1229  * numerically lower than m[]. x[] and y[] MAY be the same array. The
1230  * "m0i" parameter is equal to -(1/m0) mod 2^32, where m0 is the least
1231  * significant value word of m[] (this works only if m[] is an odd
1232  * integer).
1233  */
1234 void br_i32_montymul(uint32_t *d, const uint32_t *x, const uint32_t *y,
1235         const uint32_t *m, uint32_t m0i);
1236
1237 /*
1238  * Compute a modular exponentiation. x[] MUST be an integer modulo m[]
1239  * (same announced bit length, lower value). m[] MUST be odd. The
1240  * exponent is in big-endian unsigned notation, over 'elen' bytes. The
1241  * "m0i" parameter is equal to -(1/m0) mod 2^32, where m0 is the least
1242  * significant value word of m[] (this works only if m[] is an odd
1243  * integer). The t1[] and t2[] parameters must be temporary arrays,
1244  * each large enough to accommodate an integer with the same size as m[].
1245  */
1246 void br_i32_modpow(uint32_t *x, const unsigned char *e, size_t elen,
1247         const uint32_t *m, uint32_t m0i, uint32_t *t1, uint32_t *t2);
1248
1249 /* ==================================================================== */
1250
1251 /*
1252  * Integers 'i31'
1253  * --------------
1254  *
1255  * The 'i31' functions implement computations on big integers using
1256  * an internal representation as an array of 32-bit integers. For
1257  * an array x[]:
1258  *  -- x[0] encodes the array length and the "announced bit length"
1259  *     of the integer: namely, if the announced bit length is k,
1260  *     then x[0] = ((k / 31) << 5) + (k % 31).
1261  *  -- x[1], x[2]... contain the value in little-endian order, 31
1262  *     bits per word (x[1] contains the least significant 31 bits).
1263  *     The upper bit of each word is 0.
1264  *
1265  * Multiplications rely on the elementary 32x32->64 multiplication.
1266  *
1267  * The announced bit length specifies the number of bits that are
1268  * significant in the subsequent 32-bit words. Unused bits in the
1269  * last (most significant) word are set to 0; subsequent words are
1270  * uninitialized and need not exist at all.
1271  *
1272  * The execution time and memory access patterns of all computations
1273  * depend on the announced bit length, but not on the actual word
1274  * values. For modular integers, the announced bit length of any integer
1275  * modulo n is equal to the actual bit length of n; thus, computations
1276  * on modular integers are "constant-time" (only the modulus length may
1277  * leak).
1278  */
1279
1280 /*
1281  * Test whether an integer is zero.
1282  */
1283 uint32_t br_i31_iszero(const uint32_t *x);
1284
1285 /*
1286  * Add b[] to a[] and return the carry (0 or 1). If ctl is 0, then a[]
1287  * is unmodified, but the carry is still computed and returned. The
1288  * arrays a[] and b[] MUST have the same announced bit length.
1289  *
1290  * a[] and b[] MAY be the same array, but partial overlap is not allowed.
1291  */
1292 uint32_t br_i31_add(uint32_t *a, const uint32_t *b, uint32_t ctl);
1293
1294 /*
1295  * Subtract b[] from a[] and return the carry (0 or 1). If ctl is 0,
1296  * then a[] is unmodified, but the carry is still computed and returned.
1297  * The arrays a[] and b[] MUST have the same announced bit length.
1298  *
1299  * a[] and b[] MAY be the same array, but partial overlap is not allowed.
1300  */
1301 uint32_t br_i31_sub(uint32_t *a, const uint32_t *b, uint32_t ctl);
1302
1303 /*
1304  * Compute the ENCODED actual bit length of an integer. The argument x
1305  * should point to the first (least significant) value word of the
1306  * integer. The len 'xlen' contains the number of 32-bit words to
1307  * access. The upper bit of each value word MUST be 0.
1308  * Returned value is ((k / 31) << 5) + (k % 31) if the bit length is k.
1309  *
1310  * CT: value or length of x does not leak.
1311  */
1312 uint32_t br_i31_bit_length(uint32_t *x, size_t xlen);
1313
1314 /*
1315  * Decode an integer from its big-endian unsigned representation. The
1316  * "true" bit length of the integer is computed and set in the encoded
1317  * announced bit length (x[0]), but all words of x[] corresponding to
1318  * the full 'len' bytes of the source are set.
1319  *
1320  * CT: value or length of x does not leak.
1321  */
1322 void br_i31_decode(uint32_t *x, const void *src, size_t len);
1323
1324 /*
1325  * Decode an integer from its big-endian unsigned representation. The
1326  * integer MUST be lower than m[]; the (encoded) announced bit length
1327  * written in x[] will be equal to that of m[]. All 'len' bytes from the
1328  * source are read.
1329  *
1330  * Returned value is 1 if the decode value fits within the modulus, 0
1331  * otherwise. In the latter case, the x[] buffer will be set to 0 (but
1332  * still with the announced bit length of m[]).
1333  *
1334  * CT: value or length of x does not leak. Memory access pattern depends
1335  * only of 'len' and the announced bit length of m. Whether x fits or
1336  * not does not leak either.
1337  */
1338 uint32_t br_i31_decode_mod(uint32_t *x,
1339         const void *src, size_t len, const uint32_t *m);
1340
1341 /*
1342  * Zeroize an integer. The announced bit length is set to the provided
1343  * value, and the corresponding words are set to 0. The ENCODED bit length
1344  * is expected here.
1345  */
1346 static inline void
1347 br_i31_zero(uint32_t *x, uint32_t bit_len)
1348 {
1349         *x ++ = bit_len;
1350         memset(x, 0, ((bit_len + 31) >> 5) * sizeof *x);
1351 }
1352
1353 /*
1354  * Right-shift an integer. The shift amount must be lower than 31
1355  * bits.
1356  */
1357 void br_i31_rshift(uint32_t *x, int count);
1358
1359 /*
1360  * Reduce an integer (a[]) modulo another (m[]). The result is written
1361  * in x[] and its announced bit length is set to be equal to that of m[].
1362  *
1363  * x[] MUST be distinct from a[] and m[].
1364  *
1365  * CT: only announced bit lengths leak, not values of x, a or m.
1366  */
1367 void br_i31_reduce(uint32_t *x, const uint32_t *a, const uint32_t *m);
1368
1369 /*
1370  * Decode an integer from its big-endian unsigned representation, and
1371  * reduce it modulo the provided modulus m[]. The announced bit length
1372  * of the result is set to be equal to that of the modulus.
1373  *
1374  * x[] MUST be distinct from m[].
1375  */
1376 void br_i31_decode_reduce(uint32_t *x,
1377         const void *src, size_t len, const uint32_t *m);
1378
1379 /*
1380  * Multiply x[] by 2^31 and then add integer z, modulo m[]. This
1381  * function assumes that x[] and m[] have the same announced bit
1382  * length, the announced bit length of m[] matches its true
1383  * bit length.
1384  *
1385  * x[] and m[] MUST be distinct arrays. z MUST fit in 31 bits (upper
1386  * bit set to 0).
1387  *
1388  * CT: only the common announced bit length of x and m leaks, not
1389  * the values of x, z or m.
1390  */
1391 void br_i31_muladd_small(uint32_t *x, uint32_t z, const uint32_t *m);
1392
1393 /*
1394  * Encode an integer into its big-endian unsigned representation. The
1395  * output length in bytes is provided (parameter 'len'); if the length
1396  * is too short then the integer is appropriately truncated; if it is
1397  * too long then the extra bytes are set to 0.
1398  */
1399 void br_i31_encode(void *dst, size_t len, const uint32_t *x);
1400
1401 /*
1402  * Compute -(1/x) mod 2^31. If x is even, then this function returns 0.
1403  */
1404 uint32_t br_i31_ninv31(uint32_t x);
1405
1406 /*
1407  * Compute a modular Montgomery multiplication. d[] is filled with the
1408  * value of x*y/R modulo m[] (where R is the Montgomery factor). The
1409  * array d[] MUST be distinct from x[], y[] and m[]. x[] and y[] MUST be
1410  * numerically lower than m[]. x[] and y[] MAY be the same array. The
1411  * "m0i" parameter is equal to -(1/m0) mod 2^31, where m0 is the least
1412  * significant value word of m[] (this works only if m[] is an odd
1413  * integer).
1414  */
1415 void br_i31_montymul(uint32_t *d, const uint32_t *x, const uint32_t *y,
1416         const uint32_t *m, uint32_t m0i);
1417
1418 /*
1419  * Convert a modular integer to Montgomery representation. The integer x[]
1420  * MUST be lower than m[], but with the same announced bit length.
1421  */
1422 void br_i31_to_monty(uint32_t *x, const uint32_t *m);
1423
1424 /*
1425  * Convert a modular integer back from Montgomery representation. The
1426  * integer x[] MUST be lower than m[], but with the same announced bit
1427  * length. The "m0i" parameter is equal to -(1/m0) mod 2^32, where m0 is
1428  * the least significant value word of m[] (this works only if m[] is
1429  * an odd integer).
1430  */
1431 void br_i31_from_monty(uint32_t *x, const uint32_t *m, uint32_t m0i);
1432
1433 /*
1434  * Compute a modular exponentiation. x[] MUST be an integer modulo m[]
1435  * (same announced bit length, lower value). m[] MUST be odd. The
1436  * exponent is in big-endian unsigned notation, over 'elen' bytes. The
1437  * "m0i" parameter is equal to -(1/m0) mod 2^31, where m0 is the least
1438  * significant value word of m[] (this works only if m[] is an odd
1439  * integer). The t1[] and t2[] parameters must be temporary arrays,
1440  * each large enough to accommodate an integer with the same size as m[].
1441  */
1442 void br_i31_modpow(uint32_t *x, const unsigned char *e, size_t elen,
1443         const uint32_t *m, uint32_t m0i, uint32_t *t1, uint32_t *t2);
1444
1445 /*
1446  * Compute a modular exponentiation. x[] MUST be an integer modulo m[]
1447  * (same announced bit length, lower value). m[] MUST be odd. The
1448  * exponent is in big-endian unsigned notation, over 'elen' bytes. The
1449  * "m0i" parameter is equal to -(1/m0) mod 2^31, where m0 is the least
1450  * significant value word of m[] (this works only if m[] is an odd
1451  * integer). The tmp[] array is used for temporaries, and has size
1452  * 'twlen' words; it must be large enough to accommodate at least two
1453  * temporary values with the same size as m[] (including the leading
1454  * "bit length" word). If there is room for more temporaries, then this
1455  * function may use the extra room for window-based optimisation,
1456  * resulting in faster computations.
1457  *
1458  * Returned value is 1 on success, 0 on error. An error is reported if
1459  * the provided tmp[] array is too short.
1460  */
1461 uint32_t br_i31_modpow_opt(uint32_t *x, const unsigned char *e, size_t elen,
1462         const uint32_t *m, uint32_t m0i, uint32_t *tmp, size_t twlen);
1463
1464 /*
1465  * Compute d+a*b, result in d. The initial announced bit length of d[]
1466  * MUST match that of a[]. The d[] array MUST be large enough to
1467  * accommodate the full result, plus (possibly) an extra word. The
1468  * resulting announced bit length of d[] will be the sum of the announced
1469  * bit lengths of a[] and b[] (therefore, it may be larger than the actual
1470  * bit length of the numerical result).
1471  *
1472  * a[] and b[] may be the same array. d[] must be disjoint from both a[]
1473  * and b[].
1474  */
1475 void br_i31_mulacc(uint32_t *d, const uint32_t *a, const uint32_t *b);
1476
1477 /* ==================================================================== */
1478
1479 /*
1480  * FIXME: document "i15" functions.
1481  */
1482
1483 static inline void
1484 br_i15_zero(uint16_t *x, uint16_t bit_len)
1485 {
1486         *x ++ = bit_len;
1487         memset(x, 0, ((bit_len + 15) >> 4) * sizeof *x);
1488 }
1489
1490 uint32_t br_i15_iszero(const uint16_t *x);
1491
1492 uint16_t br_i15_ninv15(uint16_t x);
1493
1494 uint32_t br_i15_add(uint16_t *a, const uint16_t *b, uint32_t ctl);
1495
1496 uint32_t br_i15_sub(uint16_t *a, const uint16_t *b, uint32_t ctl);
1497
1498 void br_i15_muladd_small(uint16_t *x, uint16_t z, const uint16_t *m);
1499
1500 void br_i15_montymul(uint16_t *d, const uint16_t *x, const uint16_t *y,
1501         const uint16_t *m, uint16_t m0i);
1502
1503 void br_i15_to_monty(uint16_t *x, const uint16_t *m);
1504
1505 void br_i15_modpow(uint16_t *x, const unsigned char *e, size_t elen,
1506         const uint16_t *m, uint16_t m0i, uint16_t *t1, uint16_t *t2);
1507
1508 uint32_t br_i15_modpow_opt(uint16_t *x, const unsigned char *e, size_t elen,
1509         const uint16_t *m, uint16_t m0i, uint16_t *tmp, size_t twlen);
1510
1511 void br_i15_encode(void *dst, size_t len, const uint16_t *x);
1512
1513 uint32_t br_i15_decode_mod(uint16_t *x,
1514         const void *src, size_t len, const uint16_t *m);
1515
1516 void br_i15_rshift(uint16_t *x, int count);
1517
1518 uint32_t br_i15_bit_length(uint16_t *x, size_t xlen);
1519
1520 void br_i15_decode(uint16_t *x, const void *src, size_t len);
1521
1522 void br_i15_from_monty(uint16_t *x, const uint16_t *m, uint16_t m0i);
1523
1524 void br_i15_decode_reduce(uint16_t *x,
1525         const void *src, size_t len, const uint16_t *m);
1526
1527 void br_i15_reduce(uint16_t *x, const uint16_t *a, const uint16_t *m);
1528
1529 void br_i15_mulacc(uint16_t *d, const uint16_t *a, const uint16_t *b);
1530
1531 uint32_t br_i62_modpow_opt(uint32_t *x31, const unsigned char *e, size_t elen,
1532         const uint32_t *m31, uint32_t m0i31, uint64_t *tmp, size_t twlen);
1533
1534 /* ==================================================================== */
1535
1536 static inline size_t
1537 br_digest_size(const br_hash_class *digest_class)
1538 {
1539         return (size_t)(digest_class->desc >> BR_HASHDESC_OUT_OFF)
1540                 & BR_HASHDESC_OUT_MASK;
1541 }
1542
1543 /*
1544  * Get the output size (in bytes) of a hash function.
1545  */
1546 size_t br_digest_size_by_ID(int digest_id);
1547
1548 /*
1549  * Get the OID (encoded OBJECT IDENTIFIER value, without tag and length)
1550  * for a hash function. If digest_id is not a supported digest identifier
1551  * (in particular if it is equal to 0, i.e. br_md5sha1_ID), then NULL is
1552  * returned and *len is set to 0.
1553  */
1554 const unsigned char *br_digest_OID(int digest_id, size_t *len);
1555
1556 /* ==================================================================== */
1557 /*
1558  * DES support functions.
1559  */
1560
1561 /*
1562  * Apply DES Initial Permutation.
1563  */
1564 void br_des_do_IP(uint32_t *xl, uint32_t *xr);
1565
1566 /*
1567  * Apply DES Final Permutation (inverse of IP).
1568  */
1569 void br_des_do_invIP(uint32_t *xl, uint32_t *xr);
1570
1571 /*
1572  * Key schedule unit: for a DES key (8 bytes), compute 16 subkeys. Each
1573  * subkey is two 28-bit words represented as two 32-bit words; the PC-2
1574  * bit extration is NOT applied.
1575  */
1576 void br_des_keysched_unit(uint32_t *skey, const void *key);
1577
1578 /*
1579  * Reversal of 16 DES sub-keys (for decryption).
1580  */
1581 void br_des_rev_skey(uint32_t *skey);
1582
1583 /*
1584  * DES/3DES key schedule for 'des_tab' (encryption direction). Returned
1585  * value is the number of rounds.
1586  */
1587 unsigned br_des_tab_keysched(uint32_t *skey, const void *key, size_t key_len);
1588
1589 /*
1590  * DES/3DES key schedule for 'des_ct' (encryption direction). Returned
1591  * value is the number of rounds.
1592  */
1593 unsigned br_des_ct_keysched(uint32_t *skey, const void *key, size_t key_len);
1594
1595 /*
1596  * DES/3DES subkey decompression (from the compressed bitsliced subkeys).
1597  */
1598 void br_des_ct_skey_expand(uint32_t *sk_exp,
1599         unsigned num_rounds, const uint32_t *skey);
1600
1601 /*
1602  * DES/3DES block encryption/decryption ('des_tab').
1603  */
1604 void br_des_tab_process_block(unsigned num_rounds,
1605         const uint32_t *skey, void *block);
1606
1607 /*
1608  * DES/3DES block encryption/decryption ('des_ct').
1609  */
1610 void br_des_ct_process_block(unsigned num_rounds,
1611         const uint32_t *skey, void *block);
1612
1613 /* ==================================================================== */
1614 /*
1615  * AES support functions.
1616  */
1617
1618 /*
1619  * The AES S-box (256-byte table).
1620  */
1621 extern const unsigned char br_aes_S[];
1622
1623 /*
1624  * AES key schedule. skey[] is filled with n+1 128-bit subkeys, where n
1625  * is the number of rounds (10 to 14, depending on key size). The number
1626  * of rounds is returned. If the key size is invalid (not 16, 24 or 32),
1627  * then 0 is returned.
1628  *
1629  * This implementation uses a 256-byte table and is NOT constant-time.
1630  */
1631 unsigned br_aes_keysched(uint32_t *skey, const void *key, size_t key_len);
1632
1633 /*
1634  * AES key schedule for decryption ('aes_big' implementation).
1635  */
1636 unsigned br_aes_big_keysched_inv(uint32_t *skey,
1637         const void *key, size_t key_len);
1638
1639 /*
1640  * AES block encryption with the 'aes_big' implementation (fast, but
1641  * not constant-time). This function encrypts a single block "in place".
1642  */
1643 void br_aes_big_encrypt(unsigned num_rounds, const uint32_t *skey, void *data);
1644
1645 /*
1646  * AES block decryption with the 'aes_big' implementation (fast, but
1647  * not constant-time). This function decrypts a single block "in place".
1648  */
1649 void br_aes_big_decrypt(unsigned num_rounds, const uint32_t *skey, void *data);
1650
1651 /*
1652  * AES block encryption with the 'aes_small' implementation (small, but
1653  * slow and not constant-time). This function encrypts a single block
1654  * "in place".
1655  */
1656 void br_aes_small_encrypt(unsigned num_rounds,
1657         const uint32_t *skey, void *data);
1658
1659 /*
1660  * AES block decryption with the 'aes_small' implementation (small, but
1661  * slow and not constant-time). This function decrypts a single block
1662  * "in place".
1663  */
1664 void br_aes_small_decrypt(unsigned num_rounds,
1665         const uint32_t *skey, void *data);
1666
1667 /*
1668  * The constant-time implementation is "bitsliced": the 128-bit state is
1669  * split over eight 32-bit words q* in the following way:
1670  *
1671  * -- Input block consists in 16 bytes:
1672  *    a00 a10 a20 a30 a01 a11 a21 a31 a02 a12 a22 a32 a03 a13 a23 a33
1673  * In the terminology of FIPS 197, this is a 4x4 matrix which is read
1674  * column by column.
1675  *
1676  * -- Each byte is split into eight bits which are distributed over the
1677  * eight words, at the same rank. Thus, for a byte x at rank k, bit 0
1678  * (least significant) of x will be at rank k in q0 (if that bit is b,
1679  * then it contributes "b << k" to the value of q0), bit 1 of x will be
1680  * at rank k in q1, and so on.
1681  *
1682  * -- Ranks given to bits are in "row order" and are either all even, or
1683  * all odd. Two independent AES states are thus interleaved, one using
1684  * the even ranks, the other the odd ranks. Row order means:
1685  *    a00 a01 a02 a03 a10 a11 a12 a13 a20 a21 a22 a23 a30 a31 a32 a33
1686  *
1687  * Converting input bytes from two AES blocks to bitslice representation
1688  * is done in the following way:
1689  * -- Decode first block into the four words q0 q2 q4 q6, in that order,
1690  * using little-endian convention.
1691  * -- Decode second block into the four words q1 q3 q5 q7, in that order,
1692  * using little-endian convention.
1693  * -- Call br_aes_ct_ortho().
1694  *
1695  * Converting back to bytes is done by using the reverse operations. Note
1696  * that br_aes_ct_ortho() is its own inverse.
1697  */
1698
1699 /*
1700  * Perform bytewise orthogonalization of eight 32-bit words. Bytes
1701  * of q0..q7 are spread over all words: for a byte x that occurs
1702  * at rank i in q[j] (byte x uses bits 8*i to 8*i+7 in q[j]), the bit
1703  * of rank k in x (0 <= k <= 7) goes to q[k] at rank 8*i+j.
1704  *
1705  * This operation is an involution.
1706  */
1707 void br_aes_ct_ortho(uint32_t *q);
1708
1709 /*
1710  * The AES S-box, as a bitsliced constant-time version. The input array
1711  * consists in eight 32-bit words; 32 S-box instances are computed in
1712  * parallel. Bits 0 to 7 of each S-box input (bit 0 is least significant)
1713  * are spread over the words 0 to 7, at the same rank.
1714  */
1715 void br_aes_ct_bitslice_Sbox(uint32_t *q);
1716
1717 /*
1718  * Like br_aes_bitslice_Sbox(), but for the inverse S-box.
1719  */
1720 void br_aes_ct_bitslice_invSbox(uint32_t *q);
1721
1722 /*
1723  * Compute AES encryption on bitsliced data. Since input is stored on
1724  * eight 32-bit words, two block encryptions are actually performed
1725  * in parallel.
1726  */
1727 void br_aes_ct_bitslice_encrypt(unsigned num_rounds,
1728         const uint32_t *skey, uint32_t *q);
1729
1730 /*
1731  * Compute AES decryption on bitsliced data. Since input is stored on
1732  * eight 32-bit words, two block decryptions are actually performed
1733  * in parallel.
1734  */
1735 void br_aes_ct_bitslice_decrypt(unsigned num_rounds,
1736         const uint32_t *skey, uint32_t *q);
1737
1738 /*
1739  * AES key schedule, constant-time version. skey[] is filled with n+1
1740  * 128-bit subkeys, where n is the number of rounds (10 to 14, depending
1741  * on key size). The number of rounds is returned. If the key size is
1742  * invalid (not 16, 24 or 32), then 0 is returned.
1743  */
1744 unsigned br_aes_ct_keysched(uint32_t *comp_skey,
1745         const void *key, size_t key_len);
1746
1747 /*
1748  * Expand AES subkeys as produced by br_aes_ct_keysched(), into
1749  * a larger array suitable for br_aes_ct_bitslice_encrypt() and
1750  * br_aes_ct_bitslice_decrypt().
1751  */
1752 void br_aes_ct_skey_expand(uint32_t *skey,
1753         unsigned num_rounds, const uint32_t *comp_skey);
1754
1755 /*
1756  * For the ct64 implementation, the same bitslicing technique is used,
1757  * but four instances are interleaved. First instance uses bits 0, 4,
1758  * 8, 12,... of each word; second instance uses bits 1, 5, 9, 13,...
1759  * and so on.
1760  */
1761
1762 /*
1763  * Perform bytewise orthogonalization of eight 64-bit words. Bytes
1764  * of q0..q7 are spread over all words: for a byte x that occurs
1765  * at rank i in q[j] (byte x uses bits 8*i to 8*i+7 in q[j]), the bit
1766  * of rank k in x (0 <= k <= 7) goes to q[k] at rank 8*i+j.
1767  *
1768  * This operation is an involution.
1769  */
1770 void br_aes_ct64_ortho(uint64_t *q);
1771
1772 /*
1773  * Interleave bytes for an AES input block. If input bytes are
1774  * denoted 0123456789ABCDEF, and have been decoded with little-endian
1775  * convention (w[0] contains 0123, with '3' being most significant;
1776  * w[1] contains 4567, and so on), then output word q0 will be
1777  * set to 08192A3B (again little-endian convention) and q1 will
1778  * be set to 4C5D6E7F.
1779  */
1780 void br_aes_ct64_interleave_in(uint64_t *q0, uint64_t *q1, const uint32_t *w);
1781
1782 /*
1783  * Perform the opposite of br_aes_ct64_interleave_in().
1784  */
1785 void br_aes_ct64_interleave_out(uint32_t *w, uint64_t q0, uint64_t q1);
1786
1787 /*
1788  * The AES S-box, as a bitsliced constant-time version. The input array
1789  * consists in eight 64-bit words; 64 S-box instances are computed in
1790  * parallel. Bits 0 to 7 of each S-box input (bit 0 is least significant)
1791  * are spread over the words 0 to 7, at the same rank.
1792  */
1793 void br_aes_ct64_bitslice_Sbox(uint64_t *q);
1794
1795 /*
1796  * Like br_aes_bitslice_Sbox(), but for the inverse S-box.
1797  */
1798 void br_aes_ct64_bitslice_invSbox(uint64_t *q);
1799
1800 /*
1801  * Compute AES encryption on bitsliced data. Since input is stored on
1802  * eight 64-bit words, four block encryptions are actually performed
1803  * in parallel.
1804  */
1805 void br_aes_ct64_bitslice_encrypt(unsigned num_rounds,
1806         const uint64_t *skey, uint64_t *q);
1807
1808 /*
1809  * Compute AES decryption on bitsliced data. Since input is stored on
1810  * eight 64-bit words, four block decryptions are actually performed
1811  * in parallel.
1812  */
1813 void br_aes_ct64_bitslice_decrypt(unsigned num_rounds,
1814         const uint64_t *skey, uint64_t *q);
1815
1816 /*
1817  * AES key schedule, constant-time version. skey[] is filled with n+1
1818  * 128-bit subkeys, where n is the number of rounds (10 to 14, depending
1819  * on key size). The number of rounds is returned. If the key size is
1820  * invalid (not 16, 24 or 32), then 0 is returned.
1821  */
1822 unsigned br_aes_ct64_keysched(uint64_t *comp_skey,
1823         const void *key, size_t key_len);
1824
1825 /*
1826  * Expand AES subkeys as produced by br_aes_ct64_keysched(), into
1827  * a larger array suitable for br_aes_ct64_bitslice_encrypt() and
1828  * br_aes_ct64_bitslice_decrypt().
1829  */
1830 void br_aes_ct64_skey_expand(uint64_t *skey,
1831         unsigned num_rounds, const uint64_t *comp_skey);
1832
1833 /*
1834  * Test support for AES-NI opcodes.
1835  */
1836 int br_aes_x86ni_supported(void);
1837
1838 /*
1839  * AES key schedule, using x86 AES-NI instructions. This yields the
1840  * subkeys in the encryption direction. Number of rounds is returned.
1841  * Key size MUST be 16, 24 or 32 bytes; otherwise, 0 is returned.
1842  */
1843 unsigned br_aes_x86ni_keysched_enc(unsigned char *skni,
1844         const void *key, size_t len);
1845
1846 /*
1847  * AES key schedule, using x86 AES-NI instructions. This yields the
1848  * subkeys in the decryption direction. Number of rounds is returned.
1849  * Key size MUST be 16, 24 or 32 bytes; otherwise, 0 is returned.
1850  */
1851 unsigned br_aes_x86ni_keysched_dec(unsigned char *skni,
1852         const void *key, size_t len);
1853
1854 /*
1855  * Test support for AES POWER8 opcodes.
1856  */
1857 int br_aes_pwr8_supported(void);
1858
1859 /*
1860  * AES key schedule, using POWER8 instructions. This yields the
1861  * subkeys in the encryption direction. Number of rounds is returned.
1862  * Key size MUST be 16, 24 or 32 bytes; otherwise, 0 is returned.
1863  */
1864 unsigned br_aes_pwr8_keysched(unsigned char *skni,
1865         const void *key, size_t len);
1866
1867 /* ==================================================================== */
1868 /*
1869  * RSA.
1870  */
1871
1872 /*
1873  * Apply proper PKCS#1 v1.5 padding (for signatures). 'hash_oid' is
1874  * the encoded hash function OID, or NULL.
1875  */
1876 uint32_t br_rsa_pkcs1_sig_pad(const unsigned char *hash_oid,
1877         const unsigned char *hash, size_t hash_len,
1878         uint32_t n_bitlen, unsigned char *x);
1879
1880 /*
1881  * Check PKCS#1 v1.5 padding (for signatures). 'hash_oid' is the encoded
1882  * hash function OID, or NULL. The provided 'sig' value is _after_ the
1883  * modular exponentiation, i.e. it should be the padded hash. On
1884  * success, the hashed message is extracted.
1885  */
1886 uint32_t br_rsa_pkcs1_sig_unpad(const unsigned char *sig, size_t sig_len,
1887         const unsigned char *hash_oid, size_t hash_len,
1888         unsigned char *hash_out);
1889
1890 /*
1891  * Apply OAEP padding. Returned value is the actual padded string length,
1892  * or zero on error.
1893  */
1894 size_t br_rsa_oaep_pad(const br_prng_class **rnd, const br_hash_class *dig,
1895         const void *label, size_t label_len, const br_rsa_public_key *pk,
1896         void *dst, size_t dst_nax_len, const void *src, size_t src_len);
1897
1898 /*
1899  * Unravel and check OAEP padding. If the padding is correct, then 1 is
1900  * returned, '*len' is adjusted to the length of the message, and the
1901  * data is moved to the start of the 'data' buffer. If the padding is
1902  * incorrect, then 0 is returned and '*len' is untouched. Either way,
1903  * the complete buffer contents are altered.
1904  */
1905 uint32_t br_rsa_oaep_unpad(const br_hash_class *dig,
1906         const void *label, size_t label_len, void *data, size_t *len);
1907
1908 /*
1909  * Compute MGF1 for a given seed, and XOR the output into the provided
1910  * buffer.
1911  */
1912 void br_mgf1_xor(void *data, size_t len,
1913         const br_hash_class *dig, const void *seed, size_t seed_len);
1914
1915 /* ==================================================================== */
1916 /*
1917  * Elliptic curves.
1918  */
1919
1920 /*
1921  * Type for generic EC parameters: curve order (unsigned big-endian
1922  * encoding) and encoded conventional generator.
1923  */
1924 typedef struct {
1925         int curve;
1926         const unsigned char *order;
1927         size_t order_len;
1928         const unsigned char *generator;
1929         size_t generator_len;
1930 } br_ec_curve_def;
1931
1932 extern const br_ec_curve_def br_secp256r1;
1933 extern const br_ec_curve_def br_secp384r1;
1934 extern const br_ec_curve_def br_secp521r1;
1935
1936 /*
1937  * For Curve25519, the advertised "order" really is 2^255-1, since the
1938  * point multipliction function really works over arbitrary 255-bit
1939  * scalars. This value is only meant as a hint for ECDH key generation;
1940  * only ECDSA uses the exact curve order, and ECDSA is not used with
1941  * that specific curve.
1942  */
1943 extern const br_ec_curve_def br_curve25519;
1944
1945 /*
1946  * Decode some bytes as an i31 integer, with truncation (corresponding
1947  * to the 'bits2int' operation in RFC 6979). The target ENCODED bit
1948  * length is provided as last parameter. The resulting value will have
1949  * this declared bit length, and consists the big-endian unsigned decoding
1950  * of exactly that many bits in the source (capped at the source length).
1951  */
1952 void br_ecdsa_i31_bits2int(uint32_t *x,
1953         const void *src, size_t len, uint32_t ebitlen);
1954
1955 /*
1956  * Decode some bytes as an i15 integer, with truncation (corresponding
1957  * to the 'bits2int' operation in RFC 6979). The target ENCODED bit
1958  * length is provided as last parameter. The resulting value will have
1959  * this declared bit length, and consists the big-endian unsigned decoding
1960  * of exactly that many bits in the source (capped at the source length).
1961  */
1962 void br_ecdsa_i15_bits2int(uint16_t *x,
1963         const void *src, size_t len, uint32_t ebitlen);
1964
1965 /* ==================================================================== */
1966 /*
1967  * SSL/TLS support functions.
1968  */
1969
1970 /*
1971  * Record types.
1972  */
1973 #define BR_SSL_CHANGE_CIPHER_SPEC    20
1974 #define BR_SSL_ALERT                 21
1975 #define BR_SSL_HANDSHAKE             22
1976 #define BR_SSL_APPLICATION_DATA      23
1977
1978 /*
1979  * Handshake message types.
1980  */
1981 #define BR_SSL_HELLO_REQUEST          0
1982 #define BR_SSL_CLIENT_HELLO           1
1983 #define BR_SSL_SERVER_HELLO           2
1984 #define BR_SSL_CERTIFICATE           11
1985 #define BR_SSL_SERVER_KEY_EXCHANGE   12
1986 #define BR_SSL_CERTIFICATE_REQUEST   13
1987 #define BR_SSL_SERVER_HELLO_DONE     14
1988 #define BR_SSL_CERTIFICATE_VERIFY    15
1989 #define BR_SSL_CLIENT_KEY_EXCHANGE   16
1990 #define BR_SSL_FINISHED              20
1991
1992 /*
1993  * Alert levels.
1994  */
1995 #define BR_LEVEL_WARNING   1
1996 #define BR_LEVEL_FATAL     2
1997
1998 /*
1999  * Low-level I/O state.
2000  */
2001 #define BR_IO_FAILED   0
2002 #define BR_IO_IN       1
2003 #define BR_IO_OUT      2
2004 #define BR_IO_INOUT    3
2005
2006 /*
2007  * Mark a SSL engine as failed. The provided error code is recorded if
2008  * the engine was not already marked as failed. If 'err' is 0, then the
2009  * engine is marked as closed (without error).
2010  */
2011 void br_ssl_engine_fail(br_ssl_engine_context *cc, int err);
2012
2013 /*
2014  * Test whether the engine is closed (normally or as a failure).
2015  */
2016 static inline int
2017 br_ssl_engine_closed(const br_ssl_engine_context *cc)
2018 {
2019         return cc->iomode == BR_IO_FAILED;
2020 }
2021
2022 /*
2023  * Configure a new maximum fragment length. If possible, the maximum
2024  * length for outgoing records is immediately adjusted (if there are
2025  * not already too many buffered bytes for that).
2026  */
2027 void br_ssl_engine_new_max_frag_len(
2028         br_ssl_engine_context *rc, unsigned max_frag_len);
2029
2030 /*
2031  * Test whether the current incoming record has been fully received
2032  * or not. This functions returns 0 only if a complete record header
2033  * has been received, but some of the (possibly encrypted) payload
2034  * has not yet been obtained.
2035  */
2036 int br_ssl_engine_recvrec_finished(const br_ssl_engine_context *rc);
2037
2038 /*
2039  * Flush the current record (if not empty). This is meant to be called
2040  * from the handshake processor only.
2041  */
2042 void br_ssl_engine_flush_record(br_ssl_engine_context *cc);
2043
2044 /*
2045  * Test whether there is some accumulated payload to send.
2046  */
2047 static inline int
2048 br_ssl_engine_has_pld_to_send(const br_ssl_engine_context *rc)
2049 {
2050         return rc->oxa != rc->oxb && rc->oxa != rc->oxc;
2051 }
2052
2053 /*
2054  * Initialize RNG in engine. Returned value is 1 on success, 0 on error.
2055  * This function will try to use the OS-provided RNG, if available. If
2056  * there is no OS-provided RNG, or if it failed, and no entropy was
2057  * injected by the caller, then a failure will be reported. On error,
2058  * the context error code is set.
2059  */
2060 int br_ssl_engine_init_rand(br_ssl_engine_context *cc);
2061
2062 /*
2063  * Reset the handshake-related parts of the engine.
2064  */
2065 void br_ssl_engine_hs_reset(br_ssl_engine_context *cc,
2066         void (*hsinit)(void *), void (*hsrun)(void *));
2067
2068 /*
2069  * Get the PRF to use for this context, for the provided PRF hash
2070  * function ID.
2071  */
2072 br_tls_prf_impl br_ssl_engine_get_PRF(br_ssl_engine_context *cc, int prf_id);
2073
2074 /*
2075  * Consume the provided pre-master secret and compute the corresponding
2076  * master secret. The 'prf_id' is the ID of the hash function to use
2077  * with the TLS 1.2 PRF (ignored if the version is TLS 1.0 or 1.1).
2078  */
2079 void br_ssl_engine_compute_master(br_ssl_engine_context *cc,
2080         int prf_id, const void *pms, size_t len);
2081
2082 /*
2083  * Switch to CBC decryption for incoming records.
2084  *    cc               the engine context
2085  *    is_client        non-zero for a client, zero for a server
2086  *    prf_id           id of hash function for PRF (ignored if not TLS 1.2+)
2087  *    mac_id           id of hash function for HMAC
2088  *    bc_impl          block cipher implementation (CBC decryption)
2089  *    cipher_key_len   block cipher key length (in bytes)
2090  */
2091 void br_ssl_engine_switch_cbc_in(br_ssl_engine_context *cc,
2092         int is_client, int prf_id, int mac_id,
2093         const br_block_cbcdec_class *bc_impl, size_t cipher_key_len);
2094
2095 /*
2096  * Switch to CBC encryption for outgoing records.
2097  *    cc               the engine context
2098  *    is_client        non-zero for a client, zero for a server
2099  *    prf_id           id of hash function for PRF (ignored if not TLS 1.2+)
2100  *    mac_id           id of hash function for HMAC
2101  *    bc_impl          block cipher implementation (CBC encryption)
2102  *    cipher_key_len   block cipher key length (in bytes)
2103  */
2104 void br_ssl_engine_switch_cbc_out(br_ssl_engine_context *cc,
2105         int is_client, int prf_id, int mac_id,
2106         const br_block_cbcenc_class *bc_impl, size_t cipher_key_len);
2107
2108 /*
2109  * Switch to GCM decryption for incoming records.
2110  *    cc               the engine context
2111  *    is_client        non-zero for a client, zero for a server
2112  *    prf_id           id of hash function for PRF
2113  *    bc_impl          block cipher implementation (CTR)
2114  *    cipher_key_len   block cipher key length (in bytes)
2115  */
2116 void br_ssl_engine_switch_gcm_in(br_ssl_engine_context *cc,
2117         int is_client, int prf_id,
2118         const br_block_ctr_class *bc_impl, size_t cipher_key_len);
2119
2120 /*
2121  * Switch to GCM encryption for outgoing records.
2122  *    cc               the engine context
2123  *    is_client        non-zero for a client, zero for a server
2124  *    prf_id           id of hash function for PRF
2125  *    bc_impl          block cipher implementation (CTR)
2126  *    cipher_key_len   block cipher key length (in bytes)
2127  */
2128 void br_ssl_engine_switch_gcm_out(br_ssl_engine_context *cc,
2129         int is_client, int prf_id,
2130         const br_block_ctr_class *bc_impl, size_t cipher_key_len);
2131
2132 /*
2133  * Switch to ChaCha20+Poly1305 decryption for incoming records.
2134  *    cc               the engine context
2135  *    is_client        non-zero for a client, zero for a server
2136  *    prf_id           id of hash function for PRF
2137  */
2138 void br_ssl_engine_switch_chapol_in(br_ssl_engine_context *cc,
2139         int is_client, int prf_id);
2140
2141 /*
2142  * Switch to ChaCha20+Poly1305 encryption for outgoing records.
2143  *    cc               the engine context
2144  *    is_client        non-zero for a client, zero for a server
2145  *    prf_id           id of hash function for PRF
2146  */
2147 void br_ssl_engine_switch_chapol_out(br_ssl_engine_context *cc,
2148         int is_client, int prf_id);
2149
2150 /*
2151  * Calls to T0-generated code.
2152  */
2153 void br_ssl_hs_client_init_main(void *ctx);
2154 void br_ssl_hs_client_run(void *ctx);
2155 void br_ssl_hs_server_init_main(void *ctx);
2156 void br_ssl_hs_server_run(void *ctx);
2157
2158 /*
2159  * Get the hash function to use for signatures, given a bit mask of
2160  * supported hash functions. This implements a strict choice order
2161  * (namely SHA-256, SHA-384, SHA-512, SHA-224, SHA-1). If the mask
2162  * does not document support of any of these hash functions, then this
2163  * functions returns 0.
2164  */
2165 int br_ssl_choose_hash(unsigned bf);
2166
2167 /* ==================================================================== */
2168
2169 /*
2170  * PowerPC / POWER assembly stuff. The special BR_POWER_ASM_MACROS macro
2171  * must be defined before including this file; this is done by source
2172  * files that use some inline assembly for PowerPC / POWER machines.
2173  */
2174
2175 #if BR_POWER_ASM_MACROS
2176
2177 #define lxvw4x(xt, ra, rb)        lxvw4x_(xt, ra, rb)
2178 #define stxvw4x(xt, ra, rb)       stxvw4x_(xt, ra, rb)
2179
2180 #define bdnz(foo)                 bdnz_(foo)
2181 #define beq(foo)                  beq_(foo)
2182
2183 #define li(rx, value)             li_(rx, value)
2184 #define addi(rx, ra, imm)         addi_(rx, ra, imm)
2185 #define cmpldi(rx, imm)           cmpldi_(rx, imm)
2186 #define mtctr(rx)                 mtctr_(rx)
2187 #define vspltb(vrt, vrb, uim)     vspltb_(vrt, vrb, uim)
2188 #define vspltw(vrt, vrb, uim)     vspltw_(vrt, vrb, uim)
2189 #define vspltisb(vrt, imm)        vspltisb_(vrt, imm)
2190 #define vspltisw(vrt, imm)        vspltisw_(vrt, imm)
2191 #define vrlw(vrt, vra, vrb)       vrlw_(vrt, vra, vrb)
2192 #define vsbox(vrt, vra)           vsbox_(vrt, vra)
2193 #define vxor(vrt, vra, vrb)       vxor_(vrt, vra, vrb)
2194 #define vand(vrt, vra, vrb)       vand_(vrt, vra, vrb)
2195 #define vsro(vrt, vra, vrb)       vsro_(vrt, vra, vrb)
2196 #define vsl(vrt, vra, vrb)        vsl_(vrt, vra, vrb)
2197 #define vsldoi(vt, va, vb, sh)    vsldoi_(vt, va, vb, sh)
2198 #define vsr(vrt, vra, vrb)        vsr_(vrt, vra, vrb)
2199 #define vadduwm(vrt, vra, vrb)    vadduwm_(vrt, vra, vrb)
2200 #define vsububm(vrt, vra, vrb)    vsububm_(vrt, vra, vrb)
2201 #define vsubuwm(vrt, vra, vrb)    vsubuwm_(vrt, vra, vrb)
2202 #define vsrw(vrt, vra, vrb)       vsrw_(vrt, vra, vrb)
2203 #define vcipher(vt, va, vb)       vcipher_(vt, va, vb)
2204 #define vcipherlast(vt, va, vb)   vcipherlast_(vt, va, vb)
2205 #define vncipher(vt, va, vb)      vncipher_(vt, va, vb)
2206 #define vncipherlast(vt, va, vb)  vncipherlast_(vt, va, vb)
2207 #define vperm(vt, va, vb, vc)     vperm_(vt, va, vb, vc)
2208 #define vpmsumd(vt, va, vb)       vpmsumd_(vt, va, vb)
2209 #define xxpermdi(vt, va, vb, d)   xxpermdi_(vt, va, vb, d)
2210
2211 #define lxvw4x_(xt, ra, rb)       "\tlxvw4x\t" #xt "," #ra "," #rb "\n"
2212 #define stxvw4x_(xt, ra, rb)      "\tstxvw4x\t" #xt "," #ra "," #rb "\n"
2213
2214 #define label(foo)                #foo "%=:\n"
2215 #define bdnz_(foo)                "\tbdnz\t" #foo "%=\n"
2216 #define beq_(foo)                 "\tbeq\t" #foo "%=\n"
2217
2218 #define li_(rx, value)            "\tli\t" #rx "," #value "\n"
2219 #define addi_(rx, ra, imm)        "\taddi\t" #rx "," #ra "," #imm "\n"
2220 #define cmpldi_(rx, imm)          "\tcmpldi\t" #rx "," #imm "\n"
2221 #define mtctr_(rx)                "\tmtctr\t" #rx "\n"
2222 #define vspltb_(vrt, vrb, uim)    "\tvspltb\t" #vrt "," #vrb "," #uim "\n"
2223 #define vspltw_(vrt, vrb, uim)    "\tvspltw\t" #vrt "," #vrb "," #uim "\n"
2224 #define vspltisb_(vrt, imm)       "\tvspltisb\t" #vrt "," #imm "\n"
2225 #define vspltisw_(vrt, imm)       "\tvspltisw\t" #vrt "," #imm "\n"
2226 #define vrlw_(vrt, vra, vrb)      "\tvrlw\t" #vrt "," #vra "," #vrb "\n"
2227 #define vsbox_(vrt, vra)          "\tvsbox\t" #vrt "," #vra "\n"
2228 #define vxor_(vrt, vra, vrb)      "\tvxor\t" #vrt "," #vra "," #vrb "\n"
2229 #define vand_(vrt, vra, vrb)      "\tvand\t" #vrt "," #vra "," #vrb "\n"
2230 #define vsro_(vrt, vra, vrb)      "\tvsro\t" #vrt "," #vra "," #vrb "\n"
2231 #define vsl_(vrt, vra, vrb)       "\tvsl\t" #vrt "," #vra "," #vrb "\n"
2232 #define vsldoi_(vt, va, vb, sh)   "\tvsldoi\t" #vt "," #va "," #vb "," #sh "\n"
2233 #define vsr_(vrt, vra, vrb)       "\tvsr\t" #vrt "," #vra "," #vrb "\n"
2234 #define vadduwm_(vrt, vra, vrb)   "\tvadduwm\t" #vrt "," #vra "," #vrb "\n"
2235 #define vsububm_(vrt, vra, vrb)   "\tvsububm\t" #vrt "," #vra "," #vrb "\n"
2236 #define vsubuwm_(vrt, vra, vrb)   "\tvsubuwm\t" #vrt "," #vra "," #vrb "\n"
2237 #define vsrw_(vrt, vra, vrb)      "\tvsrw\t" #vrt "," #vra "," #vrb "\n"
2238 #define vcipher_(vt, va, vb)      "\tvcipher\t" #vt "," #va "," #vb "\n"
2239 #define vcipherlast_(vt, va, vb)  "\tvcipherlast\t" #vt "," #va "," #vb "\n"
2240 #define vncipher_(vt, va, vb)     "\tvncipher\t" #vt "," #va "," #vb "\n"
2241 #define vncipherlast_(vt, va, vb) "\tvncipherlast\t" #vt "," #va "," #vb "\n"
2242 #define vperm_(vt, va, vb, vc)    "\tvperm\t" #vt "," #va "," #vb "," #vc "\n"
2243 #define vpmsumd_(vt, va, vb)      "\tvpmsumd\t" #vt "," #va "," #vb "\n"
2244 #define xxpermdi_(vt, va, vb, d)  "\txxpermdi\t" #vt "," #va "," #vb "," #d "\n"
2245
2246 #endif
2247
2248 /* ==================================================================== */
2249 /*
2250  * Special "activate intrinsics" code, needed for some compiler versions.
2251  * This is defined at the end of this file, so that it won't impact any
2252  * of the inline functions defined previously; and it is controlled by
2253  * a specific macro defined in the caller code.
2254  *
2255  * Calling code conventions:
2256  *
2257  *  - Caller must define BR_ENABLE_INTRINSICS before including "inner.h".
2258  *  - Functions that use intrinsics must be enclosed in an "enabled"
2259  *    region (between BR_TARGETS_X86_UP and BR_TARGETS_X86_DOWN).
2260  *  - Functions that use intrinsics must be tagged with the appropriate
2261  *    BR_TARGET().
2262  */
2263
2264 #if BR_ENABLE_INTRINSICS && (BR_GCC_4_4 || BR_CLANG_3_7 || BR_MSC_2005)
2265
2266 /*
2267  * x86 intrinsics (both 32-bit and 64-bit).
2268  */
2269 #if BR_i386 || BR_amd64
2270
2271 /*
2272  * On GCC before version 5.0, we need to use the pragma to enable the
2273  * target options globally, because the 'target' function attribute
2274  * appears to be unreliable. Before 4.6 we must also avoid the
2275  * push_options / pop_options mechanism, because it tends to trigger
2276  * some internal compiler errors.
2277  */
2278 #if BR_GCC && !BR_GCC_5_0
2279 #if BR_GCC_4_6
2280 #define BR_TARGETS_X86_UP \
2281         _Pragma("GCC push_options") \
2282         _Pragma("GCC target(\"sse2,ssse3,sse4.1,aes,pclmul,rdrnd\")")
2283 #define BR_TARGETS_X86_DOWN \
2284         _Pragma("GCC pop_options")
2285 #else
2286 #define BR_TARGETS_X86_UP \
2287         _Pragma("GCC target(\"sse2,ssse3,sse4.1,aes,pclmul\")")
2288 #endif
2289 #define BR_TARGETS_X86_DOWN
2290 #pragma GCC diagnostic ignored "-Wpsabi"
2291 #endif
2292
2293 #if BR_CLANG && !BR_CLANG_3_8
2294 #undef __SSE2__
2295 #undef __SSE3__
2296 #undef __SSSE3__
2297 #undef __SSE4_1__
2298 #undef __AES__
2299 #undef __PCLMUL__
2300 #undef __RDRND__
2301 #define __SSE2__     1
2302 #define __SSE3__     1
2303 #define __SSSE3__    1
2304 #define __SSE4_1__   1
2305 #define __AES__      1
2306 #define __PCLMUL__   1
2307 #define __RDRND__    1
2308 #endif
2309
2310 #ifndef BR_TARGETS_X86_UP
2311 #define BR_TARGETS_X86_UP
2312 #endif
2313 #ifndef BR_TARGETS_X86_DOWN
2314 #define BR_TARGETS_X86_DOWN
2315 #endif
2316
2317 #if BR_GCC || BR_CLANG
2318 BR_TARGETS_X86_UP
2319 #include <x86intrin.h>
2320 #include <cpuid.h>
2321 #define br_bswap32   __builtin_bswap32
2322 BR_TARGETS_X86_DOWN
2323 #endif
2324
2325 #if BR_MSC
2326 #include <stdlib.h>
2327 #include <intrin.h>
2328 #include <immintrin.h>
2329 #define br_bswap32   _byteswap_ulong
2330 #endif
2331
2332 static inline int
2333 br_cpuid(uint32_t mask_eax, uint32_t mask_ebx,
2334         uint32_t mask_ecx, uint32_t mask_edx)
2335 {
2336 #if BR_GCC || BR_CLANG
2337         unsigned eax, ebx, ecx, edx;
2338
2339         if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
2340                 if ((eax & mask_eax) == mask_eax
2341                         && (ebx & mask_ebx) == mask_ebx
2342                         && (ecx & mask_ecx) == mask_ecx
2343                         && (edx & mask_edx) == mask_edx)
2344                 {
2345                         return 1;
2346                 }
2347         }
2348 #elif BR_MSC
2349         int info[4];
2350
2351         __cpuid(info, 1);
2352         if (((uint32_t)info[0] & mask_eax) == mask_eax
2353                 && ((uint32_t)info[1] & mask_ebx) == mask_ebx
2354                 && ((uint32_t)info[2] & mask_ecx) == mask_ecx
2355                 && ((uint32_t)info[3] & mask_edx) == mask_edx)
2356         {
2357                 return 1;
2358         }
2359 #endif
2360         return 0;
2361 }
2362
2363 #endif
2364
2365 #endif
2366
2367 /* ==================================================================== */
2368
2369 #endif