Workaround for compiler bug (GCC 4.8 and 4.9 when targeting 32-bit x86).
BearSSL: src/hash/ghash_pclmul.c
/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "inner.h"

/*
 * This is the GHASH implementation that leverages the pclmulqdq opcode
 * (from the AES-NI instructions).
 */

#if BR_AES_X86NI

#if BR_AES_X86NI_GCC
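/*
 * With old GCC versions (4.8 and 4.9), the pragmas below are needed to
 * make the intrinsic headers usable, and ignoring -Wpsabi silences
 * spurious ABI warnings from these compilers when targeting 32-bit x86.
 */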
#if BR_AES_X86NI_GCC_OLD
#pragma GCC push_options
#pragma GCC target("sse2,ssse3,pclmul")
#pragma GCC diagnostic ignored "-Wpsabi"
#endif
#include <tmmintrin.h>
#include <wmmintrin.h>
#include <cpuid.h>
#endif

#if BR_AES_X86NI_MSC
#include <intrin.h>
#endif

/*
 * GHASH is defined over elements of GF(2^128) with "full little-endian"
 * representation: leftmost byte is least significant, and, within each
 * byte, leftmost _bit_ is least significant. The natural ordering in
 * x86 is "mixed little-endian": bytes are ordered from least to most
 * significant, but bits within a byte are in most-to-least significant
 * order. Going to full little-endian representation would require
 * reversing bits within each byte, which is doable but expensive.
 *
 * Instead, we go to full big-endian representation, by swapping bytes
 * around, which is done with a single _mm_shuffle_epi8() opcode (it
 * comes with SSSE3; all CPUs that offer pclmulqdq also have SSSE3). We
 * can use a full big-endian representation because in a carryless
 * multiplication, we have a nice bit reversal property:
 *
 *    rev_128(x) * rev_128(y) = rev_255(x * y)
 *
 * So by using full big-endian, we still get the right result, except
 * that it is right-shifted by 1 bit. The left-shift is relatively
 * inexpensive, and it can be mutualised. An illustrative check of the
 * reversal property, on small values, follows this comment.
 *
 * Since SSE2 opcodes do not have facilities for shifting full 128-bit
 * values with bit precision, we have to break down values into 64-bit
 * chunks. We number chunks from 0 to 3 in left to right order.
 */
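
/*
 * Illustrative check of the reversal property (a sketch, not compiled
 * into BearSSL): on 8-bit values, multiplying the 8-bit reversals
 * yields the 15-bit reversal of the carryless product. The clmul8()
 * and revbits() helpers are hypothetical, for demonstration only.
 */
#if 0
static unsigned
clmul8(unsigned x, unsigned y)
{
	unsigned z;
	int i;

	/* Carryless multiplication: XOR shifted copies of y. */
	z = 0;
	for (i = 0; i < 8; i ++) {
		z ^= ((x >> i) & 1) * (y << i);
	}
	return z;
}

static unsigned
revbits(unsigned x, int num)
{
	unsigned z;
	int i;

	/* Reverse the low 'num' bits of x. */
	z = 0;
	for (i = 0; i < num; i ++) {
		z |= ((x >> i) & 1) << (num - 1 - i);
	}
	return z;
}

/*
 * For all x, y < 256:
 *   clmul8(revbits(x, 8), revbits(y, 8)) == revbits(clmul8(x, y), 15)
 * which is the 8-bit analogue of the rev_128/rev_255 property above.
 */
#endif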

/*
 * From a 128-bit value kw, compute kx as the XOR of the two 64-bit
 * halves of kw (into the right half of kx; left half is unspecified).
 */
#define BK(kw, kx)   do { \
		kx = _mm_xor_si128(kw, _mm_shuffle_epi32(kw, 0x0E)); \
	} while (0)
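
/*
 * _mm_shuffle_epi32(kw, 0x0E) copies the upper 64-bit half of kw into
 * its lower half, so the XOR in BK() leaves kw_hi ^ kw_lo in the low
 * 64 bits of kx.
 */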

/*
 * Combine two 64-bit values (k0:k1) into a 128-bit (kw) value and
 * the XOR of the two values (kx).
 */
#define PBK(k0, k1, kw, kx)   do { \
		kw = _mm_unpacklo_epi64(k1, k0); \
		kx = _mm_xor_si128(k0, k1); \
	} while (0)

/*
 * Left-shift by 1 bit a 256-bit value (in four 64-bit words).
 */
#define SL_256(x0, x1, x2, x3)   do { \
		x0 = _mm_or_si128( \
			_mm_slli_epi64(x0, 1), \
			_mm_srli_epi64(x1, 63)); \
		x1 = _mm_or_si128( \
			_mm_slli_epi64(x1, 1), \
			_mm_srli_epi64(x2, 63)); \
		x2 = _mm_or_si128( \
			_mm_slli_epi64(x2, 1), \
			_mm_srli_epi64(x3, 63)); \
		x3 = _mm_slli_epi64(x3, 1); \
	} while (0)

/*
 * Perform reduction in GF(2^128). The 256-bit value is in x0..x3;
 * result is written in x0..x1.
 */
#define REDUCE_F128(x0, x1, x2, x3)   do { \
		x1 = _mm_xor_si128( \
			x1, \
			_mm_xor_si128( \
				_mm_xor_si128( \
					x3, \
					_mm_srli_epi64(x3, 1)), \
				_mm_xor_si128( \
					_mm_srli_epi64(x3, 2), \
					_mm_srli_epi64(x3, 7)))); \
		x2 = _mm_xor_si128( \
			_mm_xor_si128( \
				x2, \
				_mm_slli_epi64(x3, 63)), \
			_mm_xor_si128( \
				_mm_slli_epi64(x3, 62), \
				_mm_slli_epi64(x3, 57))); \
		x0 = _mm_xor_si128( \
			x0, \
			_mm_xor_si128( \
				_mm_xor_si128( \
					x2, \
					_mm_srli_epi64(x2, 1)), \
				_mm_xor_si128( \
					_mm_srli_epi64(x2, 2), \
					_mm_srli_epi64(x2, 7)))); \
		x1 = _mm_xor_si128( \
			_mm_xor_si128( \
				x1, \
				_mm_slli_epi64(x2, 63)), \
			_mm_xor_si128( \
				_mm_slli_epi64(x2, 62), \
				_mm_slli_epi64(x2, 57))); \
	} while (0)
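
/*
 * The shift counts above implement the GHASH modulus
 * X^128 + X^7 + X^2 + X + 1 on the bit-reversed representation: each
 * extra chunk w (x3, then x2) is folded as
 * w ^ (w >> 1) ^ (w >> 2) ^ (w >> 7) into the chunk two places to its
 * left, with the bits shifted out, (w << 63) ^ (w << 62) ^ (w << 57),
 * going into the chunk directly to its left.
 */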

/*
 * Square value kw into (dw,dx).
 */
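/*
 * Squaring takes only two pclmulqdq because cross terms vanish in
 * GF(2)[X]: (a1*X^64 + a0)^2 = (a1^2)*X^128 + a0^2.
 */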
#define SQUARE_F128(kw, dw, dx)   do { \
		__m128i z0, z1, z2, z3; \
		z1 = _mm_clmulepi64_si128(kw, kw, 0x11); \
		z3 = _mm_clmulepi64_si128(kw, kw, 0x00); \
		z0 = _mm_shuffle_epi32(z1, 0x0E); \
		z2 = _mm_shuffle_epi32(z3, 0x0E); \
		SL_256(z0, z1, z2, z3); \
		REDUCE_F128(z0, z1, z2, z3); \
		PBK(z0, z1, dw, dx); \
	} while (0)

/* see bearssl_hash.h */
BR_TARGET("ssse3,pclmul")
void
br_ghash_pclmul(void *y, const void *h, const void *data, size_t len)
{
	const unsigned char *buf1, *buf2;
	unsigned char tmp[64];
	size_t num4, num1;
	__m128i yw, h1w, h1x;
	__m128i byteswap_index;

	/*
	 * We split data into two chunks. First chunk starts at buf1
	 * and contains num4 blocks of 64-byte values. Second chunk
	 * starts at buf2 and contains num1 blocks of 16-byte values.
	 * We want the first chunk to be as large as possible.
	 */
	buf1 = data;
	num4 = len >> 6;
	len &= 63;
	buf2 = buf1 + (num4 << 6);
	num1 = (len + 15) >> 4;
	if ((len & 15) != 0) {
		memcpy(tmp, buf2, len);
		memset(tmp + len, 0, (num1 << 4) - len);
		buf2 = tmp;
	}

	/*
	 * Constant value to perform endian conversion.
	 */
	byteswap_index = _mm_set_epi8(
		0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);

	/*
	 * Load y and h.
	 */
	yw = _mm_loadu_si128(y);
	h1w = _mm_loadu_si128(h);
	yw = _mm_shuffle_epi8(yw, byteswap_index);
	h1w = _mm_shuffle_epi8(h1w, byteswap_index);
	BK(h1w, h1x);

	if (num4 > 0) {
		__m128i h2w, h2x, h3w, h3x, h4w, h4x;
		__m128i t0, t1, t2, t3;

		/*
		 * Compute h2 = h^2.
		 */
		SQUARE_F128(h1w, h2w, h2x);

		/*
		 * Compute h3 = h^3 = h*(h^2).
		 */
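		/*
		 * This is a 128x128->256 carryless multiplication with
		 * only three pclmulqdq, Karatsuba-style: for a = a1:a0
		 * and b = b1:b0, the middle 128-bit word is
		 * (a0 ^ a1)*(b0 ^ b1) ^ a1*b1 ^ a0*b0, reusing the high
		 * and low products; h1x and h2x hold the XORed halves.
		 */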
		t1 = _mm_clmulepi64_si128(h1w, h2w, 0x11);
		t3 = _mm_clmulepi64_si128(h1w, h2w, 0x00);
		t2 = _mm_xor_si128(_mm_clmulepi64_si128(h1x, h2x, 0x00),
			_mm_xor_si128(t1, t3));
		t0 = _mm_shuffle_epi32(t1, 0x0E);
		t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
		t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
		SL_256(t0, t1, t2, t3);
		REDUCE_F128(t0, t1, t2, t3);
		PBK(t0, t1, h3w, h3x);

		/*
		 * Compute h4 = h^4 = (h^2)^2.
		 */
		SQUARE_F128(h2w, h4w, h4x);

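		/*
		 * Process four blocks per iteration: with y the current
		 * state and a0..a3 the four data blocks, compute
		 *
		 *   ((y ^ a0)*h^4) ^ (a1*h^3) ^ (a2*h^2) ^ (a3*h)
		 *
		 * so that a single reduction pays for four block
		 * multiplications.
		 */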
		while (num4 -- > 0) {
			__m128i aw0, aw1, aw2, aw3;
			__m128i ax0, ax1, ax2, ax3;

			aw0 = _mm_loadu_si128((void *)(buf1 + 0));
			aw1 = _mm_loadu_si128((void *)(buf1 + 16));
			aw2 = _mm_loadu_si128((void *)(buf1 + 32));
			aw3 = _mm_loadu_si128((void *)(buf1 + 48));
			aw0 = _mm_shuffle_epi8(aw0, byteswap_index);
			aw1 = _mm_shuffle_epi8(aw1, byteswap_index);
			aw2 = _mm_shuffle_epi8(aw2, byteswap_index);
			aw3 = _mm_shuffle_epi8(aw3, byteswap_index);
			buf1 += 64;

			aw0 = _mm_xor_si128(aw0, yw);
			BK(aw1, ax1);
			BK(aw2, ax2);
			BK(aw3, ax3);
			BK(aw0, ax0);

			t1 = _mm_xor_si128(
				_mm_xor_si128(
					_mm_clmulepi64_si128(aw0, h4w, 0x11),
					_mm_clmulepi64_si128(aw1, h3w, 0x11)),
				_mm_xor_si128(
					_mm_clmulepi64_si128(aw2, h2w, 0x11),
					_mm_clmulepi64_si128(aw3, h1w, 0x11)));
			t3 = _mm_xor_si128(
				_mm_xor_si128(
					_mm_clmulepi64_si128(aw0, h4w, 0x00),
					_mm_clmulepi64_si128(aw1, h3w, 0x00)),
				_mm_xor_si128(
					_mm_clmulepi64_si128(aw2, h2w, 0x00),
					_mm_clmulepi64_si128(aw3, h1w, 0x00)));
			t2 = _mm_xor_si128(
				_mm_xor_si128(
					_mm_clmulepi64_si128(ax0, h4x, 0x00),
					_mm_clmulepi64_si128(ax1, h3x, 0x00)),
				_mm_xor_si128(
					_mm_clmulepi64_si128(ax2, h2x, 0x00),
					_mm_clmulepi64_si128(ax3, h1x, 0x00)));
			t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
			t0 = _mm_shuffle_epi32(t1, 0x0E);
			t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
			t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
			SL_256(t0, t1, t2, t3);
			REDUCE_F128(t0, t1, t2, t3);
			yw = _mm_unpacklo_epi64(t1, t0);
		}
	}

	while (num1 -- > 0) {
		__m128i aw, ax;
		__m128i t0, t1, t2, t3;

		aw = _mm_loadu_si128((void *)buf2);
		aw = _mm_shuffle_epi8(aw, byteswap_index);
		buf2 += 16;

		aw = _mm_xor_si128(aw, yw);
		BK(aw, ax);

		t1 = _mm_clmulepi64_si128(aw, h1w, 0x11);
		t3 = _mm_clmulepi64_si128(aw, h1w, 0x00);
		t2 = _mm_clmulepi64_si128(ax, h1x, 0x00);
		t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
		t0 = _mm_shuffle_epi32(t1, 0x0E);
		t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
		t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
		SL_256(t0, t1, t2, t3);
		REDUCE_F128(t0, t1, t2, t3);
		yw = _mm_unpacklo_epi64(t1, t0);
	}

	yw = _mm_shuffle_epi8(yw, byteswap_index);
	_mm_storeu_si128(y, yw);
}

/*
 * Test CPU support for PCLMULQDQ.
 */
static int
pclmul_supported(void)
{
	/*
	 * Bit mask for features in ECX:
	 *   1   PCLMULQDQ support
	 */
#define MASK   0x00000002

#if BR_AES_X86NI_GCC
	unsigned eax, ebx, ecx, edx;

	if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
		return (ecx & MASK) == MASK;
	} else {
		return 0;
	}
#elif BR_AES_X86NI_MSC
	int info[4];

	__cpuid(info, 1);
	return ((uint32_t)info[2] & MASK) == MASK;
#else
	return 0;
#endif

#undef MASK
}

/* see bearssl_hash.h */
br_ghash
br_ghash_pclmul_get(void)
{
	return pclmul_supported() ? &br_ghash_pclmul : 0;
}
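
/*
 * Illustrative usage sketch (not compiled in): obtain the function
 * pointer at runtime and fall back to a portable implementation when
 * pclmulqdq is not available. The demo_ghash() helper is hypothetical,
 * not part of BearSSL.
 */
#if 0
static void
demo_ghash(unsigned char y[16],
	const unsigned char h[16], const void *data, size_t len)
{
	br_ghash gh;

	gh = br_ghash_pclmul_get();
	if (gh == 0) {
		gh = &br_ghash_ctmul;   /* constant-time software fallback */
	}
	memset(y, 0, 16);   /* GHASH state starts at zero */
	gh(y, h, data, len);
}
#endif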

#if BR_AES_X86NI_GCC && BR_AES_X86NI_GCC_OLD
#pragma GCC pop_options
#endif

#else

/* see bearssl_hash.h */
br_ghash
br_ghash_pclmul_get(void)
{
	return 0;
}

#endif