0b32d51a19908420d707d52520d9725c6732a62b
[BearSSL] / src / symcipher / chacha20_sse2.c
1 /*
2 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining
5 * a copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "inner.h"
26
27 /*
28 * This file contains a ChaCha20 implementation that leverages SSE2
29 * opcodes for better performance.
30 */
31
32 #if BR_SSE2
33
34 #if BR_SSE2_GCC
35 #include <emmintrin.h>
36 #include <cpuid.h>
37 #endif
38 #if BR_SSE2_MSC
39 #include <intrin.h>
40 #endif
41
42 /* see bearssl_block.h */
43 BR_TARGET("sse2")
44 uint32_t
45 br_chacha20_sse2_run(const void *key,
46 const void *iv, uint32_t cc, void *data, size_t len)
47 {
48 unsigned char *buf;
49 uint32_t ivtmp[4];
50 __m128i kw0, kw1;
51 __m128i iw, cw;
52 __m128i one;
53
54 static const uint32_t CW[] = {
55 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
56 };
57
58 buf = data;
59 kw0 = _mm_loadu_si128(key);
60 kw1 = _mm_loadu_si128((const void *)((const unsigned char *)key + 16));
61 ivtmp[0] = cc;
62 memcpy(ivtmp + 1, iv, 12);
63 iw = _mm_loadu_si128((const void *)ivtmp);
64 cw = _mm_loadu_si128((const void *)CW);
65 one = _mm_set_epi32(0, 0, 0, 1);
66
67 while (len > 0) {
68 /*
69 * sj contains state words 4*j to 4*j+3.
70 */
71 __m128i s0, s1, s2, s3;
72 int i;
73
74 s0 = cw;
75 s1 = kw0;
76 s2 = kw1;
77 s3 = iw;
78 for (i = 0; i < 10; i ++) {
79 /*
80 * Even round is straightforward application on
81 * the state words.
82 */
83 s0 = _mm_add_epi32(s0, s1);
84 s3 = _mm_xor_si128(s3, s0);
85 s3 = _mm_or_si128(
86 _mm_slli_epi32(s3, 16),
87 _mm_srli_epi32(s3, 16));
88
89 s2 = _mm_add_epi32(s2, s3);
90 s1 = _mm_xor_si128(s1, s2);
91 s1 = _mm_or_si128(
92 _mm_slli_epi32(s1, 12),
93 _mm_srli_epi32(s1, 20));
94
95 s0 = _mm_add_epi32(s0, s1);
96 s3 = _mm_xor_si128(s3, s0);
97 s3 = _mm_or_si128(
98 _mm_slli_epi32(s3, 8),
99 _mm_srli_epi32(s3, 24));
100
101 s2 = _mm_add_epi32(s2, s3);
102 s1 = _mm_xor_si128(s1, s2);
103 s1 = _mm_or_si128(
104 _mm_slli_epi32(s1, 7),
105 _mm_srli_epi32(s1, 25));
106
107 /*
108 * For the odd round, we must rotate some state
109 * words so that the computations apply on the
110 * right combinations of words.
111 */
112 s1 = _mm_shuffle_epi32(s1, 0x39);
113 s2 = _mm_shuffle_epi32(s2, 0x4E);
114 s3 = _mm_shuffle_epi32(s3, 0x93);
115
116 s0 = _mm_add_epi32(s0, s1);
117 s3 = _mm_xor_si128(s3, s0);
118 s3 = _mm_or_si128(
119 _mm_slli_epi32(s3, 16),
120 _mm_srli_epi32(s3, 16));
121
122 s2 = _mm_add_epi32(s2, s3);
123 s1 = _mm_xor_si128(s1, s2);
124 s1 = _mm_or_si128(
125 _mm_slli_epi32(s1, 12),
126 _mm_srli_epi32(s1, 20));
127
128 s0 = _mm_add_epi32(s0, s1);
129 s3 = _mm_xor_si128(s3, s0);
130 s3 = _mm_or_si128(
131 _mm_slli_epi32(s3, 8),
132 _mm_srli_epi32(s3, 24));
133
134 s2 = _mm_add_epi32(s2, s3);
135 s1 = _mm_xor_si128(s1, s2);
136 s1 = _mm_or_si128(
137 _mm_slli_epi32(s1, 7),
138 _mm_srli_epi32(s1, 25));
139
140 /*
141 * After the odd round, we rotate back the values
142 * to undo the rotate at the start of the odd round.
143 */
144 s1 = _mm_shuffle_epi32(s1, 0x93);
145 s2 = _mm_shuffle_epi32(s2, 0x4E);
146 s3 = _mm_shuffle_epi32(s3, 0x39);
147 }
148
149 /*
150 * Addition with the initial state.
151 */
152 s0 = _mm_add_epi32(s0, cw);
153 s1 = _mm_add_epi32(s1, kw0);
154 s2 = _mm_add_epi32(s2, kw1);
155 s3 = _mm_add_epi32(s3, iw);
156
157 /*
158 * Increment block counter.
159 */
160 iw = _mm_add_epi32(iw, one);
161
162 /*
163 * XOR final state with the data.
164 */
165 if (len < 64) {
166 unsigned char tmp[64];
167 size_t u;
168
169 _mm_storeu_si128((void *)(tmp + 0), s0);
170 _mm_storeu_si128((void *)(tmp + 16), s1);
171 _mm_storeu_si128((void *)(tmp + 32), s2);
172 _mm_storeu_si128((void *)(tmp + 48), s3);
173 for (u = 0; u < len; u ++) {
174 buf[u] ^= tmp[u];
175 }
176 break;
177 } else {
178 __m128i b0, b1, b2, b3;
179
180 b0 = _mm_loadu_si128((const void *)(buf + 0));
181 b1 = _mm_loadu_si128((const void *)(buf + 16));
182 b2 = _mm_loadu_si128((const void *)(buf + 32));
183 b3 = _mm_loadu_si128((const void *)(buf + 48));
184 b0 = _mm_xor_si128(b0, s0);
185 b1 = _mm_xor_si128(b1, s1);
186 b2 = _mm_xor_si128(b2, s2);
187 b3 = _mm_xor_si128(b3, s3);
188 _mm_storeu_si128((void *)(buf + 0), b0);
189 _mm_storeu_si128((void *)(buf + 16), b1);
190 _mm_storeu_si128((void *)(buf + 32), b2);
191 _mm_storeu_si128((void *)(buf + 48), b3);
192 buf += 64;
193 len -= 64;
194 }
195 }
196
197 /*
198 * _mm_extract_epi32() requires SSE4.1. We prefer to stick to
199 * raw SSE2, thus we use _mm_extract_epi16().
200 */
201 return (uint32_t)_mm_extract_epi16(iw, 0)
202 | ((uint32_t)_mm_extract_epi16(iw, 1) << 16);
203 }
204
205 /* see bearssl_block.h */
206 br_chacha20_run
207 br_chacha20_sse2_get(void)
208 {
209 /*
210 * If using 64-bit mode, then SSE2 opcodes should be automatically
211 * available, since they are part of the ABI.
212 *
213 * In 32-bit mode, we use CPUID to detect the SSE2 feature.
214 */
215
216 #if __x86_64__ || _M_X64
217
218 return &br_chacha20_sse2_run;
219
220 #else
221
222 /*
223 * SSE2 support is indicated by bit 26 in EDX.
224 */
225 #define MASK 0x04000000
226
227 #if BR_SSE2_GCC
228 unsigned eax, ebx, ecx, edx;
229
230 if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
231 if ((edx & MASK) == MASK) {
232 return &br_chacha20_sse2_run;
233 }
234 }
235 #elif BR_SSE2_MSC
236 int info[4];
237
238 __cpuid(info, 1);
239 if (((uint32_t)info[3] & MASK) == MASK) {
240 return &br_chacha20_sse2_run;
241 }
242 #endif
243 return 0;
244
245 #endif
246 }
247
248 #else
249
250 /* see bearssl_block.h */
251 br_chacha20_run
252 br_chacha20_sse2_get(void)
253 {
254 return 0;
255 }
256
257 #endif