/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
/*
 * This file contains a ChaCha20 implementation that leverages SSE2
 * opcodes for better performance.
 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <emmintrin.h>
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(_M_X64)
/* __get_cpuid() for the 32-bit runtime SSE2 detection. */
#include <cpuid.h>
#endif
/*
 * see bearssl_block.h
 *
 * ChaCha20 stream application, SSE2 implementation. The 'len' bytes at
 * 'data' are XORed in place with the ChaCha20 keystream produced from
 * the 256-bit 'key', the 96-bit 'iv', and the starting 32-bit block
 * counter 'cc'. Returned value is the block counter after processing
 * (cc plus the number of 64-byte blocks consumed, a trailing partial
 * block counting as one).
 */
uint32_t
br_chacha20_sse2_run(const void *key,
	const void *iv, uint32_t cc, void *data, size_t len)
{
	unsigned char *buf;
	uint32_t ivtmp[4];
	__m128i kw0, kw1, iw, cw;
	__m128i one;

	/*
	 * ChaCha20 constant words ("expand 32-byte k", little-endian).
	 */
	static const uint32_t CW[] = {
		0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
	};

	buf = data;

	/*
	 * Load the 256-bit key as two 128-bit halves. Unaligned loads,
	 * since the caller may provide an arbitrary pointer.
	 */
	kw0 = _mm_loadu_si128(key);
	kw1 = _mm_loadu_si128(
		(const void *)((const unsigned char *)key + 16));

	/*
	 * Fourth state row: 32-bit block counter, then the 96-bit IV.
	 */
	ivtmp[0] = cc;
	memcpy(ivtmp + 1, iv, 12);
	iw = _mm_loadu_si128((const void *)ivtmp);
	cw = _mm_loadu_si128((const void *)CW);
	one = _mm_set_epi32(0, 0, 0, 1);

	while (len > 0) {
		/*
		 * sj contains state words 4*j to 4*j+3.
		 */
		__m128i s0, s1, s2, s3;
		int i;

		s0 = cw;
		s1 = kw0;
		s2 = kw1;
		s3 = iw;
		for (i = 0; i < 10; i ++) {
			/*
			 * Even round is straightforward application on
			 * the state words. Rotations are emulated with
			 * shift pairs (SSE2 has no 32-bit rotate).
			 */
			s0 = _mm_add_epi32(s0, s1);
			s3 = _mm_xor_si128(s3, s0);
			s3 = _mm_or_si128(
				_mm_slli_epi32(s3, 16),
				_mm_srli_epi32(s3, 16));

			s2 = _mm_add_epi32(s2, s3);
			s1 = _mm_xor_si128(s1, s2);
			s1 = _mm_or_si128(
				_mm_slli_epi32(s1, 12),
				_mm_srli_epi32(s1, 20));

			s0 = _mm_add_epi32(s0, s1);
			s3 = _mm_xor_si128(s3, s0);
			s3 = _mm_or_si128(
				_mm_slli_epi32(s3, 8),
				_mm_srli_epi32(s3, 24));

			s2 = _mm_add_epi32(s2, s3);
			s1 = _mm_xor_si128(s1, s2);
			s1 = _mm_or_si128(
				_mm_slli_epi32(s1, 7),
				_mm_srli_epi32(s1, 25));

			/*
			 * For the odd round, we must rotate some state
			 * words so that the computations apply on the
			 * right combinations of words.
			 */
			s1 = _mm_shuffle_epi32(s1, 0x39);
			s2 = _mm_shuffle_epi32(s2, 0x4E);
			s3 = _mm_shuffle_epi32(s3, 0x93);

			s0 = _mm_add_epi32(s0, s1);
			s3 = _mm_xor_si128(s3, s0);
			s3 = _mm_or_si128(
				_mm_slli_epi32(s3, 16),
				_mm_srli_epi32(s3, 16));

			s2 = _mm_add_epi32(s2, s3);
			s1 = _mm_xor_si128(s1, s2);
			s1 = _mm_or_si128(
				_mm_slli_epi32(s1, 12),
				_mm_srli_epi32(s1, 20));

			s0 = _mm_add_epi32(s0, s1);
			s3 = _mm_xor_si128(s3, s0);
			s3 = _mm_or_si128(
				_mm_slli_epi32(s3, 8),
				_mm_srli_epi32(s3, 24));

			s2 = _mm_add_epi32(s2, s3);
			s1 = _mm_xor_si128(s1, s2);
			s1 = _mm_or_si128(
				_mm_slli_epi32(s1, 7),
				_mm_srli_epi32(s1, 25));

			/*
			 * After the odd round, we rotate back the values
			 * to undo the rotate at the start of the odd round.
			 */
			s1 = _mm_shuffle_epi32(s1, 0x93);
			s2 = _mm_shuffle_epi32(s2, 0x4E);
			s3 = _mm_shuffle_epi32(s3, 0x39);
		}

		/*
		 * Addition with the initial state.
		 */
		s0 = _mm_add_epi32(s0, cw);
		s1 = _mm_add_epi32(s1, kw0);
		s2 = _mm_add_epi32(s2, kw1);
		s3 = _mm_add_epi32(s3, iw);

		/*
		 * Increment block counter.
		 */
		iw = _mm_add_epi32(iw, one);

		/*
		 * XOR final state with the data.
		 */
		if (len < 64) {
			/*
			 * Partial final block: serialize the keystream
			 * into a stack buffer, then XOR byte by byte.
			 */
			unsigned char tmp[64];
			size_t u;

			_mm_storeu_si128((void *)(tmp +  0), s0);
			_mm_storeu_si128((void *)(tmp + 16), s1);
			_mm_storeu_si128((void *)(tmp + 32), s2);
			_mm_storeu_si128((void *)(tmp + 48), s3);
			for (u = 0; u < len; u ++) {
				buf[u] ^= tmp[u];
			}
			break;
		} else {
			__m128i b0, b1, b2, b3;

			b0 = _mm_loadu_si128((const void *)(buf +  0));
			b1 = _mm_loadu_si128((const void *)(buf + 16));
			b2 = _mm_loadu_si128((const void *)(buf + 32));
			b3 = _mm_loadu_si128((const void *)(buf + 48));
			b0 = _mm_xor_si128(b0, s0);
			b1 = _mm_xor_si128(b1, s1);
			b2 = _mm_xor_si128(b2, s2);
			b3 = _mm_xor_si128(b3, s3);
			_mm_storeu_si128((void *)(buf +  0), b0);
			_mm_storeu_si128((void *)(buf + 16), b1);
			_mm_storeu_si128((void *)(buf + 32), b2);
			_mm_storeu_si128((void *)(buf + 48), b3);
			buf += 64;
			len -= 64;
		}
	}

	/*
	 * _mm_extract_epi32() requires SSE4.1. We prefer to stick to
	 * raw SSE2, thus we use _mm_extract_epi16().
	 */
	return (uint32_t)_mm_extract_epi16(iw, 0)
		| ((uint32_t)_mm_extract_epi16(iw, 1) << 16);
}
205 /* see bearssl_block.h */
207 br_chacha20_sse2_get(void)
210 * If using 64-bit mode, then SSE2 opcodes should be automatically
211 * available, since they are part of the ABI.
213 * In 32-bit mode, we use CPUID to detect the SSE2 feature.
216 #if __x86_64__ || _M_X64
218 return &br_chacha20_sse2_run
;
223 * SSE2 support is indicated by bit 26 in EDX.
225 #define MASK 0x04000000
228 unsigned eax
, ebx
, ecx
, edx
;
230 if (__get_cpuid(1, &eax
, &ebx
, &ecx
, &edx
)) {
231 if ((edx
& MASK
) == MASK
) {
232 return &br_chacha20_sse2_run
;
239 if (((uint32_t)info
[3] & MASK
) == MASK
) {
240 return &br_chacha20_sse2_run
;
250 /* see bearssl_block.h */
252 br_chacha20_sse2_get(void)