X-Git-Url: https://www.bearssl.org/gitweb//home/git/?p=BearSSL;a=blobdiff_plain;f=src%2Fsymcipher%2Fchacha20_sse2.c;fp=src%2Fsymcipher%2Fchacha20_sse2.c;h=0b32d51a19908420d707d52520d9725c6732a62b;hp=0000000000000000000000000000000000000000;hb=24c6f09bf83015e04e16666e8a5fb66e75967e0d;hpb=5414fd525e903fe65e372735f106dcdc19109aad diff --git a/src/symcipher/chacha20_sse2.c b/src/symcipher/chacha20_sse2.c new file mode 100644 index 0000000..0b32d51 --- /dev/null +++ b/src/symcipher/chacha20_sse2.c @@ -0,0 +1,257 @@ +/* + * Copyright (c) 2017 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* + * This file contains a ChaCha20 implementation that leverages SSE2 + * opcodes for better performance. + */ + +#if BR_SSE2 + +#if BR_SSE2_GCC +#include +#include +#endif +#if BR_SSE2_MSC +#include +#endif + +/* see bearssl_block.h */ +BR_TARGET("sse2") +uint32_t +br_chacha20_sse2_run(const void *key, + const void *iv, uint32_t cc, void *data, size_t len) +{ + unsigned char *buf; + uint32_t ivtmp[4]; + __m128i kw0, kw1; + __m128i iw, cw; + __m128i one; + + static const uint32_t CW[] = { + 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 + }; + + buf = data; + kw0 = _mm_loadu_si128(key); + kw1 = _mm_loadu_si128((const void *)((const unsigned char *)key + 16)); + ivtmp[0] = cc; + memcpy(ivtmp + 1, iv, 12); + iw = _mm_loadu_si128((const void *)ivtmp); + cw = _mm_loadu_si128((const void *)CW); + one = _mm_set_epi32(0, 0, 0, 1); + + while (len > 0) { + /* + * sj contains state words 4*j to 4*j+3. + */ + __m128i s0, s1, s2, s3; + int i; + + s0 = cw; + s1 = kw0; + s2 = kw1; + s3 = iw; + for (i = 0; i < 10; i ++) { + /* + * Even round is straightforward application on + * the state words. + */ + s0 = _mm_add_epi32(s0, s1); + s3 = _mm_xor_si128(s3, s0); + s3 = _mm_or_si128( + _mm_slli_epi32(s3, 16), + _mm_srli_epi32(s3, 16)); + + s2 = _mm_add_epi32(s2, s3); + s1 = _mm_xor_si128(s1, s2); + s1 = _mm_or_si128( + _mm_slli_epi32(s1, 12), + _mm_srli_epi32(s1, 20)); + + s0 = _mm_add_epi32(s0, s1); + s3 = _mm_xor_si128(s3, s0); + s3 = _mm_or_si128( + _mm_slli_epi32(s3, 8), + _mm_srli_epi32(s3, 24)); + + s2 = _mm_add_epi32(s2, s3); + s1 = _mm_xor_si128(s1, s2); + s1 = _mm_or_si128( + _mm_slli_epi32(s1, 7), + _mm_srli_epi32(s1, 25)); + + /* + * For the odd round, we must rotate some state + * words so that the computations apply on the + * right combinations of words. + */ + s1 = _mm_shuffle_epi32(s1, 0x39); + s2 = _mm_shuffle_epi32(s2, 0x4E); + s3 = _mm_shuffle_epi32(s3, 0x93); + + s0 = _mm_add_epi32(s0, s1); + s3 = _mm_xor_si128(s3, s0); + s3 = _mm_or_si128( + _mm_slli_epi32(s3, 16), + _mm_srli_epi32(s3, 16)); + + s2 = _mm_add_epi32(s2, s3); + s1 = _mm_xor_si128(s1, s2); + s1 = _mm_or_si128( + _mm_slli_epi32(s1, 12), + _mm_srli_epi32(s1, 20)); + + s0 = _mm_add_epi32(s0, s1); + s3 = _mm_xor_si128(s3, s0); + s3 = _mm_or_si128( + _mm_slli_epi32(s3, 8), + _mm_srli_epi32(s3, 24)); + + s2 = _mm_add_epi32(s2, s3); + s1 = _mm_xor_si128(s1, s2); + s1 = _mm_or_si128( + _mm_slli_epi32(s1, 7), + _mm_srli_epi32(s1, 25)); + + /* + * After the odd round, we rotate back the values + * to undo the rotate at the start of the odd round. + */ + s1 = _mm_shuffle_epi32(s1, 0x93); + s2 = _mm_shuffle_epi32(s2, 0x4E); + s3 = _mm_shuffle_epi32(s3, 0x39); + } + + /* + * Addition with the initial state. + */ + s0 = _mm_add_epi32(s0, cw); + s1 = _mm_add_epi32(s1, kw0); + s2 = _mm_add_epi32(s2, kw1); + s3 = _mm_add_epi32(s3, iw); + + /* + * Increment block counter. + */ + iw = _mm_add_epi32(iw, one); + + /* + * XOR final state with the data. + */ + if (len < 64) { + unsigned char tmp[64]; + size_t u; + + _mm_storeu_si128((void *)(tmp + 0), s0); + _mm_storeu_si128((void *)(tmp + 16), s1); + _mm_storeu_si128((void *)(tmp + 32), s2); + _mm_storeu_si128((void *)(tmp + 48), s3); + for (u = 0; u < len; u ++) { + buf[u] ^= tmp[u]; + } + break; + } else { + __m128i b0, b1, b2, b3; + + b0 = _mm_loadu_si128((const void *)(buf + 0)); + b1 = _mm_loadu_si128((const void *)(buf + 16)); + b2 = _mm_loadu_si128((const void *)(buf + 32)); + b3 = _mm_loadu_si128((const void *)(buf + 48)); + b0 = _mm_xor_si128(b0, s0); + b1 = _mm_xor_si128(b1, s1); + b2 = _mm_xor_si128(b2, s2); + b3 = _mm_xor_si128(b3, s3); + _mm_storeu_si128((void *)(buf + 0), b0); + _mm_storeu_si128((void *)(buf + 16), b1); + _mm_storeu_si128((void *)(buf + 32), b2); + _mm_storeu_si128((void *)(buf + 48), b3); + buf += 64; + len -= 64; + } + } + + /* + * _mm_extract_epi32() requires SSE4.1. We prefer to stick to + * raw SSE2, thus we use _mm_extract_epi16(). + */ + return (uint32_t)_mm_extract_epi16(iw, 0) + | ((uint32_t)_mm_extract_epi16(iw, 1) << 16); +} + +/* see bearssl_block.h */ +br_chacha20_run +br_chacha20_sse2_get(void) +{ + /* + * If using 64-bit mode, then SSE2 opcodes should be automatically + * available, since they are part of the ABI. + * + * In 32-bit mode, we use CPUID to detect the SSE2 feature. + */ + +#if __x86_64__ || _M_X64 + + return &br_chacha20_sse2_run; + +#else + + /* + * SSE2 support is indicated by bit 26 in EDX. + */ +#define MASK 0x04000000 + +#if BR_SSE2_GCC + unsigned eax, ebx, ecx, edx; + + if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) { + if ((edx & MASK) == MASK) { + return &br_chacha20_sse2_run; + } + } +#elif BR_SSE2_MSC + int info[4]; + + __cpuid(info, 1); + if (((uint32_t)info[3] & MASK) == MASK) { + return &br_chacha20_sse2_run; + } +#endif + return 0; + +#endif +} + +#else + +/* see bearssl_block.h */ +br_chacha20_run +br_chacha20_sse2_get(void) +{ + return 0; +} + +#endif