_ Git - BearSSL/blob - src/hash/ghash_pclmul.c

   1 /*
   2  * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining
   5  * a copy of this software and associated documentation files (the
   6  * "Software"), to deal in the Software without restriction, including
   7  * without limitation the rights to use, copy, modify, merge, publish,
   8  * distribute, sublicense, and/or sell copies of the Software, and to
   9  * permit persons to whom the Software is furnished to do so, subject to
  10  * the following conditions:
  11  *
  12  * The above copyright notice and this permission notice shall be
  13  * included in all copies or substantial portions of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  16  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  17  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  18  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  19  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  20  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  21  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24
  25 #include "inner.h"
  26
  27 /*
  28  * This is the GHASH implementation that leverages the pclmulqdq opcode
  29  * (from the AES-NI instructions).
  30  */
  31
  32 #if BR_AES_X86NI
  33
  34 #if BR_AES_X86NI_GCC
  35 #if BR_AES_X86NI_GCC_OLD
  36 #pragma GCC push_options
  37 #pragma GCC target("sse2,ssse3,pclmul")
  38 #pragma GCC diagnostic ignored "-Wpsabi"
  39 #endif
  40 #include <tmmintrin.h>
  41 #include <wmmintrin.h>
  42 #include <cpuid.h>
  43 #endif
  44
  45 #if BR_AES_X86NI_MSC
  46 #include <intrin.h>
  47 #endif
  48
  49 /*
  50  * GHASH is defined over elements of GF(2^128) with "full little-endian"
  51  * representation: leftmost byte is least significant, and, within each
  52  * byte, leftmost _bit_ is least significant. The natural ordering in
  53  * x86 is "mixed little-endian": bytes are ordered from least to most
  54  * significant, but bits within a byte are in most-to-least significant
  55  * order. Going to full little-endian representation would require
  56  * reversing bits within each byte, which is doable but expensive.
  57  *
  58  * Instead, we go to full big-endian representation, by swapping bytes
  59  * around, which is done with a single _mm_shuffle_epi8() opcode (it
  60  * comes with SSSE3; all CPUs that offer pclmulqdq also have SSSE3). We
  61  * can use a full big-endian representation because in a carryless
  62  * multiplication, we have a nice bit reversal property:
  63  *
  64  *    rev_128(x) * rev_128(y) = rev_255(x * y)
  65  *
  66  * So by using full big-endian, we still get the right result, except
  67  * that it is right-shifted by 1 bit. The left-shift is relatively
  68  * inexpensive, and it can be mutualised.
  69  *
  70  *
  71  * Since SSE2 opcodes do not have facilities for shifting full 128-bit
  72  * values with bit precision, we have to break down values into 64-bit
  73  * chunks. We number chunks from 0 to 3 in left to right order.
  74  */
  75
  76 /*
  77  * From a 128-bit value kw, compute kx as the XOR of the two 64-bit
  78  * halves of kw (into the right half of kx; left half is unspecified).
  79  */
  80 #define BK(kw, kx)   do { \
  81                 kx = _mm_xor_si128(kw, _mm_shuffle_epi32(kw, 0x0E)); \
  82         } while (0)
  83
  84 /*
  85  * Combine two 64-bit values (k0:k1) into a 128-bit (kw) value and
  86  * the XOR of the two values (kx).
  87  */
  88 #define PBK(k0, k1, kw, kx)   do { \
  89                 kw = _mm_unpacklo_epi64(k1, k0); \
  90                 kx = _mm_xor_si128(k0, k1); \
  91         } while (0)
  92
  93 /*
  94  * Left-shift by 1 bit a 256-bit value (in four 64-bit words).
  95  */
  96 #define SL_256(x0, x1, x2, x3)   do { \
  97                 x0 = _mm_or_si128( \
  98                         _mm_slli_epi64(x0, 1), \
  99                         _mm_srli_epi64(x1, 63)); \
 100                 x1 = _mm_or_si128( \
 101                         _mm_slli_epi64(x1, 1), \
 102                         _mm_srli_epi64(x2, 63)); \
 103                 x2 = _mm_or_si128( \
 104                         _mm_slli_epi64(x2, 1), \
 105                         _mm_srli_epi64(x3, 63)); \
 106                 x3 = _mm_slli_epi64(x3, 1); \
 107         } while (0)
 108
 109 /*
 110  * Perform reduction in GF(2^128). The 256-bit value is in x0..x3;
 111  * result is written in x0..x1.
 112  */
 113 #define REDUCE_F128(x0, x1, x2, x3)   do { \
 114                 x1 = _mm_xor_si128( \
 115                         x1, \
 116                         _mm_xor_si128( \
 117                                 _mm_xor_si128( \
 118                                         x3, \
 119                                         _mm_srli_epi64(x3, 1)), \
 120                                 _mm_xor_si128( \
 121                                         _mm_srli_epi64(x3, 2), \
 122                                         _mm_srli_epi64(x3, 7)))); \
 123                 x2 = _mm_xor_si128( \
 124                         _mm_xor_si128( \
 125                                 x2, \
 126                                 _mm_slli_epi64(x3, 63)), \
 127                         _mm_xor_si128( \
 128                                 _mm_slli_epi64(x3, 62), \
 129                                 _mm_slli_epi64(x3, 57))); \
 130                 x0 = _mm_xor_si128( \
 131                         x0, \
 132                         _mm_xor_si128( \
 133                                 _mm_xor_si128( \
 134                                         x2, \
 135                                         _mm_srli_epi64(x2, 1)), \
 136                                 _mm_xor_si128( \
 137                                         _mm_srli_epi64(x2, 2), \
 138                                         _mm_srli_epi64(x2, 7)))); \
 139                 x1 = _mm_xor_si128( \
 140                         _mm_xor_si128( \
 141                                 x1, \
 142                                 _mm_slli_epi64(x2, 63)), \
 143                         _mm_xor_si128( \
 144                                 _mm_slli_epi64(x2, 62), \
 145                                 _mm_slli_epi64(x2, 57))); \
 146         } while (0)
 147
 148 /*
 149  * Square value kw into (dw,dx).
 150  */
 151 #define SQUARE_F128(kw, dw, dx)   do { \
 152                 __m128i z0, z1, z2, z3; \
 153                 z1 = _mm_clmulepi64_si128(kw, kw, 0x11); \
 154                 z3 = _mm_clmulepi64_si128(kw, kw, 0x00); \
 155                 z0 = _mm_shuffle_epi32(z1, 0x0E); \
 156                 z2 = _mm_shuffle_epi32(z3, 0x0E); \
 157                 SL_256(z0, z1, z2, z3); \
 158                 REDUCE_F128(z0, z1, z2, z3); \
 159                 PBK(z0, z1, dw, dx); \
 160         } while (0)
 161
 162 /* see bearssl_hash.h */
 163 BR_TARGET("ssse3,pclmul")
 164 void
 165 br_ghash_pclmul(void *y, const void *h, const void *data, size_t len)
 166 {
 167         const unsigned char *buf1, *buf2;
 168         unsigned char tmp[64];
 169         size_t num4, num1;
 170         __m128i yw, h1w, h1x;
 171         __m128i byteswap_index;
 172
 173         /*
 174          * We split data into two chunks. First chunk starts at buf1
 175          * and contains num4 blocks of 64-byte values. Second chunk
 176          * starts at buf2 and contains num1 blocks of 16-byte values.
 177          * We want the first chunk to be as large as possible.
 178          */
 179         buf1 = data;
 180         num4 = len >> 6;
 181         len &= 63;
 182         buf2 = buf1 + (num4 << 6);
 183         num1 = (len + 15) >> 4;
 184         if ((len & 15) != 0) {
 185                 memcpy(tmp, buf2, len);
 186                 memset(tmp + len, 0, (num1 << 4) - len);
 187                 buf2 = tmp;
 188         }
 189
 190         /*
 191          * Constant value to perform endian conversion.
 192          */
 193         byteswap_index = _mm_set_epi8(
 194                 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
 195
 196         /*
 197          * Load y and h.
 198          */
 199         yw = _mm_loadu_si128(y);
 200         h1w = _mm_loadu_si128(h);
 201         yw = _mm_shuffle_epi8(yw, byteswap_index);
 202         h1w = _mm_shuffle_epi8(h1w, byteswap_index);
 203         BK(h1w, h1x);
 204
 205         if (num4 > 0) {
 206                 __m128i h2w, h2x, h3w, h3x, h4w, h4x;
 207                 __m128i t0, t1, t2, t3;
 208
 209                 /*
 210                  * Compute h2 = h^2.
 211                  */
 212                 SQUARE_F128(h1w, h2w, h2x);
 213
 214                 /*
 215                  * Compute h3 = h^3 = h*(h^2).
 216                  */
 217                 t1 = _mm_clmulepi64_si128(h1w, h2w, 0x11);
 218                 t3 = _mm_clmulepi64_si128(h1w, h2w, 0x00);
 219                 t2 = _mm_xor_si128(_mm_clmulepi64_si128(h1x, h2x, 0x00),
 220                         _mm_xor_si128(t1, t3));
 221                 t0 = _mm_shuffle_epi32(t1, 0x0E);
 222                 t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
 223                 t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
 224                 SL_256(t0, t1, t2, t3);
 225                 REDUCE_F128(t0, t1, t2, t3);
 226                 PBK(t0, t1, h3w, h3x);
 227
 228                 /*
 229                  * Compute h4 = h^4 = (h^2)^2.
 230                  */
 231                 SQUARE_F128(h2w, h4w, h4x);
 232
 233                 while (num4 -- > 0) {
 234                         __m128i aw0, aw1, aw2, aw3;
 235                         __m128i ax0, ax1, ax2, ax3;
 236
 237                         aw0 = _mm_loadu_si128((void *)(buf1 +  0));
 238                         aw1 = _mm_loadu_si128((void *)(buf1 + 16));
 239                         aw2 = _mm_loadu_si128((void *)(buf1 + 32));
 240                         aw3 = _mm_loadu_si128((void *)(buf1 + 48));
 241                         aw0 = _mm_shuffle_epi8(aw0, byteswap_index);
 242                         aw1 = _mm_shuffle_epi8(aw1, byteswap_index);
 243                         aw2 = _mm_shuffle_epi8(aw2, byteswap_index);
 244                         aw3 = _mm_shuffle_epi8(aw3, byteswap_index);
 245                         buf1 += 64;
 246
 247                         aw0 = _mm_xor_si128(aw0, yw);
 248                         BK(aw1, ax1);
 249                         BK(aw2, ax2);
 250                         BK(aw3, ax3);
 251                         BK(aw0, ax0);
 252
 253                         t1 = _mm_xor_si128(
 254                                 _mm_xor_si128(
 255                                         _mm_clmulepi64_si128(aw0, h4w, 0x11),
 256                                         _mm_clmulepi64_si128(aw1, h3w, 0x11)),
 257                                 _mm_xor_si128(
 258                                         _mm_clmulepi64_si128(aw2, h2w, 0x11),
 259                                         _mm_clmulepi64_si128(aw3, h1w, 0x11)));
 260                         t3 = _mm_xor_si128(
 261                                 _mm_xor_si128(
 262                                         _mm_clmulepi64_si128(aw0, h4w, 0x00),
 263                                         _mm_clmulepi64_si128(aw1, h3w, 0x00)),
 264                                 _mm_xor_si128(
 265                                         _mm_clmulepi64_si128(aw2, h2w, 0x00),
 266                                         _mm_clmulepi64_si128(aw3, h1w, 0x00)));
 267                         t2 = _mm_xor_si128(
 268                                 _mm_xor_si128(
 269                                         _mm_clmulepi64_si128(ax0, h4x, 0x00),
 270                                         _mm_clmulepi64_si128(ax1, h3x, 0x00)),
 271                                 _mm_xor_si128(
 272                                         _mm_clmulepi64_si128(ax2, h2x, 0x00),
 273                                         _mm_clmulepi64_si128(ax3, h1x, 0x00)));
 274                         t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
 275                         t0 = _mm_shuffle_epi32(t1, 0x0E);
 276                         t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
 277                         t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
 278                         SL_256(t0, t1, t2, t3);
 279                         REDUCE_F128(t0, t1, t2, t3);
 280                         yw = _mm_unpacklo_epi64(t1, t0);
 281                 }
 282         }
 283
 284         while (num1 -- > 0) {
 285                 __m128i aw, ax;
 286                 __m128i t0, t1, t2, t3;
 287
 288                 aw = _mm_loadu_si128((void *)buf2);
 289                 aw = _mm_shuffle_epi8(aw, byteswap_index);
 290                 buf2 += 16;
 291
 292                 aw = _mm_xor_si128(aw, yw);
 293                 BK(aw, ax);
 294
 295                 t1 = _mm_clmulepi64_si128(aw, h1w, 0x11);
 296                 t3 = _mm_clmulepi64_si128(aw, h1w, 0x00);
 297                 t2 = _mm_clmulepi64_si128(ax, h1x, 0x00);
 298                 t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
 299                 t0 = _mm_shuffle_epi32(t1, 0x0E);
 300                 t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
 301                 t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
 302                 SL_256(t0, t1, t2, t3);
 303                 REDUCE_F128(t0, t1, t2, t3);
 304                 yw = _mm_unpacklo_epi64(t1, t0);
 305         }
 306
 307         yw = _mm_shuffle_epi8(yw, byteswap_index);
 308         _mm_storeu_si128(y, yw);
 309 }
 310
 311 /*
 312  * Test CPU support for PCLMULQDQ.
 313  */
 314 static int
 315 pclmul_supported(void)
 316 {
 317         /*
 318          * Bit mask for features in ECX:
 319          *    1   PCLMULQDQ support
 320          */
 321 #define MASK   0x00000002
 322
 323 #if BR_AES_X86NI_GCC
 324         unsigned eax, ebx, ecx, edx;
 325
 326         if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
 327                 return (ecx & MASK) == MASK;
 328         } else {
 329                 return 0;
 330         }
 331 #elif BR_AES_X86NI_MSC
 332         int info[4];
 333
 334         __cpuid(info, 1);
 335         return ((uint32_t)info[2] & MASK) == MASK;
 336 #else
 337         return 0;
 338 #endif
 339
 340 #undef MASK
 341 }
 342
 343 /* see bearssl_hash.h */
 344 br_ghash
 345 br_ghash_pclmul_get(void)
 346 {
 347         return pclmul_supported() ? &br_ghash_pclmul : 0;
 348 }
 349
 350 #if BR_AES_X86NI_GCC && BR_AES_X86NI_GCC_OLD
 351 #pragma GCC pop_options
 352 #endif
 353
 354 #else
 355
 356 /* see bearssl_hash.h */
 357 br_ghash
 358 br_ghash_pclmul_get(void)
 359 {
 360         return 0;
 361 }
 362
 363 #endif