/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
/*
 * This file contains a ChaCha20 implementation that leverages SSE2
 * opcodes for better performance.
 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <emmintrin.h>
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(_M_X64)
/* __get_cpuid() for the 32-bit runtime SSE2 detection. */
#include <cpuid.h>
#endif
/*
 * see bearssl_block.h
 *
 * ChaCha20 stream application, SSE2 implementation. The 'len' bytes at
 * 'data' are XORed in place with the ChaCha20 keystream produced from
 * the 256-bit 'key', the 96-bit 'iv', and the starting 32-bit block
 * counter 'cc'. Returned value is the block counter after processing
 * (cc plus the number of 64-byte blocks consumed, a trailing partial
 * block counting as one).
 */
uint32_t
br_chacha20_sse2_run(const void *key,
	const void *iv, uint32_t cc, void *data, size_t len)
{
	unsigned char *buf;
	uint32_t ivtmp[4];
	__m128i kw0, kw1, iw, cw;
	__m128i one;

	/*
	 * ChaCha20 constant words ("expand 32-byte k", little-endian).
	 */
	static const uint32_t CW[] = {
		0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
	};

	buf = data;

	/*
	 * Load the 256-bit key as two 128-bit halves. Unaligned loads,
	 * since the caller may provide an arbitrary pointer.
	 */
	kw0 = _mm_loadu_si128(key);
	kw1 = _mm_loadu_si128(
		(const void *)((const unsigned char *)key + 16));

	/*
	 * Fourth state row: 32-bit block counter, then the 96-bit IV.
	 */
	ivtmp[0] = cc;
	memcpy(ivtmp + 1, iv, 12);
	iw = _mm_loadu_si128((const void *)ivtmp);
	cw = _mm_loadu_si128((const void *)CW);
	one = _mm_set_epi32(0, 0, 0, 1);

	while (len > 0) {
		/*
		 * sj contains state words 4*j to 4*j+3.
		 */
		__m128i s0, s1, s2, s3;
		int i;

		s0 = cw;
		s1 = kw0;
		s2 = kw1;
		s3 = iw;
		for (i = 0; i < 10; i ++) {
			/*
			 * Even round is straightforward application on
			 * the state words. Rotations are emulated with
			 * shift pairs (SSE2 has no 32-bit rotate).
			 */
			s0 = _mm_add_epi32(s0, s1);
			s3 = _mm_xor_si128(s3, s0);
			s3 = _mm_or_si128(
				_mm_slli_epi32(s3, 16),
				_mm_srli_epi32(s3, 16));

			s2 = _mm_add_epi32(s2, s3);
			s1 = _mm_xor_si128(s1, s2);
			s1 = _mm_or_si128(
				_mm_slli_epi32(s1, 12),
				_mm_srli_epi32(s1, 20));

			s0 = _mm_add_epi32(s0, s1);
			s3 = _mm_xor_si128(s3, s0);
			s3 = _mm_or_si128(
				_mm_slli_epi32(s3, 8),
				_mm_srli_epi32(s3, 24));

			s2 = _mm_add_epi32(s2, s3);
			s1 = _mm_xor_si128(s1, s2);
			s1 = _mm_or_si128(
				_mm_slli_epi32(s1, 7),
				_mm_srli_epi32(s1, 25));

			/*
			 * For the odd round, we must rotate some state
			 * words so that the computations apply on the
			 * right combinations of words.
			 */
			s1 = _mm_shuffle_epi32(s1, 0x39);
			s2 = _mm_shuffle_epi32(s2, 0x4E);
			s3 = _mm_shuffle_epi32(s3, 0x93);

			s0 = _mm_add_epi32(s0, s1);
			s3 = _mm_xor_si128(s3, s0);
			s3 = _mm_or_si128(
				_mm_slli_epi32(s3, 16),
				_mm_srli_epi32(s3, 16));

			s2 = _mm_add_epi32(s2, s3);
			s1 = _mm_xor_si128(s1, s2);
			s1 = _mm_or_si128(
				_mm_slli_epi32(s1, 12),
				_mm_srli_epi32(s1, 20));

			s0 = _mm_add_epi32(s0, s1);
			s3 = _mm_xor_si128(s3, s0);
			s3 = _mm_or_si128(
				_mm_slli_epi32(s3, 8),
				_mm_srli_epi32(s3, 24));

			s2 = _mm_add_epi32(s2, s3);
			s1 = _mm_xor_si128(s1, s2);
			s1 = _mm_or_si128(
				_mm_slli_epi32(s1, 7),
				_mm_srli_epi32(s1, 25));

			/*
			 * After the odd round, we rotate back the values
			 * to undo the rotate at the start of the odd round.
			 */
			s1 = _mm_shuffle_epi32(s1, 0x93);
			s2 = _mm_shuffle_epi32(s2, 0x4E);
			s3 = _mm_shuffle_epi32(s3, 0x39);
		}

		/*
		 * Addition with the initial state.
		 */
		s0 = _mm_add_epi32(s0, cw);
		s1 = _mm_add_epi32(s1, kw0);
		s2 = _mm_add_epi32(s2, kw1);
		s3 = _mm_add_epi32(s3, iw);

		/*
		 * Increment block counter.
		 */
		iw = _mm_add_epi32(iw, one);

		/*
		 * XOR final state with the data.
		 */
		if (len < 64) {
			/*
			 * Partial final block: serialize the keystream
			 * into a stack buffer, then XOR byte by byte.
			 */
			unsigned char tmp[64];
			size_t u;

			_mm_storeu_si128((void *)(tmp +  0), s0);
			_mm_storeu_si128((void *)(tmp + 16), s1);
			_mm_storeu_si128((void *)(tmp + 32), s2);
			_mm_storeu_si128((void *)(tmp + 48), s3);
			for (u = 0; u < len; u ++) {
				buf[u] ^= tmp[u];
			}
			break;
		} else {
			__m128i b0, b1, b2, b3;

			b0 = _mm_loadu_si128((const void *)(buf +  0));
			b1 = _mm_loadu_si128((const void *)(buf + 16));
			b2 = _mm_loadu_si128((const void *)(buf + 32));
			b3 = _mm_loadu_si128((const void *)(buf + 48));
			b0 = _mm_xor_si128(b0, s0);
			b1 = _mm_xor_si128(b1, s1);
			b2 = _mm_xor_si128(b2, s2);
			b3 = _mm_xor_si128(b3, s3);
			_mm_storeu_si128((void *)(buf +  0), b0);
			_mm_storeu_si128((void *)(buf + 16), b1);
			_mm_storeu_si128((void *)(buf + 32), b2);
			_mm_storeu_si128((void *)(buf + 48), b3);
			buf += 64;
			len -= 64;
		}
	}

	/*
	 * _mm_extract_epi32() requires SSE4.1. We prefer to stick to
	 * raw SSE2, thus we use _mm_extract_epi16().
	 */
	return (uint32_t)_mm_extract_epi16(iw, 0)
		| ((uint32_t)_mm_extract_epi16(iw, 1) << 16);
}
205 /* see bearssl_block.h */
207 br_chacha20_sse2_get(void)
210 * If using 64-bit mode, then SSE2 opcodes should be automatically
211 * available, since they are part of the ABI.
213 * In 32-bit mode, we use CPUID to detect the SSE2 feature.
216 #if __x86_64__ || _M_X64
218 return &br_chacha20_sse2_run
;
223 * SSE2 support is indicated by bit 26 in EDX.
225 #define MASK 0x04000000
228 unsigned eax
, ebx
, ecx
, edx
;
230 if (__get_cpuid(1, &eax
, &ebx
, &ecx
, &edx
)) {
231 if ((edx
& MASK
) == MASK
) {
232 return &br_chacha20_sse2_run
;
239 if (((uint32_t)info
[3] & MASK
) == MASK
) {
240 return &br_chacha20_sse2_run
;
250 /* see bearssl_block.h */
252 br_chacha20_sse2_get(void)