BearSSL: src/hash/ghash_pclmul.c
/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "inner.h"

/*
 * This is the GHASH implementation that leverages the pclmulqdq opcode
 * (from the AES-NI instructions).
 */

#if BR_AES_X86NI

#if BR_AES_X86NI_GCC
/* #pragma GCC target "sse2,ssse3,pclmul" */
#include <tmmintrin.h>
#include <wmmintrin.h>
#include <cpuid.h>
#endif

#if BR_AES_X86NI_MSC
#include <intrin.h>
#endif

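/*
 * Note: on GCC/Clang builds, BR_TARGET() presumably expands to a
 * per-function target attribute, which lets br_ghash_pclmul() use the
 * SSSE3 and PCLMUL intrinsics without enabling these extensions for
 * the whole translation unit (hence the #pragma above being left as a
 * comment only).
 */
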
/* see bearssl_hash.h */
BR_TARGET("ssse3,pclmul")
void
br_ghash_pclmul(void *y, const void *h, const void *data, size_t len)
{
        /*
         * TODO: loop below processes one 16-byte word at a time. We
         * could parallelize, using:
         *   ((y+x0)*h+x1)*h = (y+x0)*(h^2) + x1*h
         * i.e. precompute h^2, then handle two words at a time, mostly
         * in parallel (this may extend to more words as well...).
         */

        const unsigned char *buf;
        __m128i yx, hx;
        __m128i h0, h1, h2;
        __m128i byteswap_index;

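        /*
         * Note: _mm_set_epi8() takes its arguments from the most
         * significant byte down to the least significant one, so the
         * index below, when used with _mm_shuffle_epi8(), reverses the
         * order of the 16 bytes in a register.
         */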
        byteswap_index = _mm_set_epi8(
                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        yx = _mm_loadu_si128(y);
        hx = _mm_loadu_si128(h);
        yx = _mm_shuffle_epi8(yx, byteswap_index);
        hx = _mm_shuffle_epi8(hx, byteswap_index);

        /*
         * We byte-swap y and h for full big-endian interpretation
         * (see below).
         */

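        /*
         * Precompute the three 64-bit operands for the Karatsuba
         * decomposition used in the loop: h0 provides the low half of
         * h, h1 has the high half of h moved into its low lane, and h2
         * is the XOR of both halves (the "middle" operand).
         */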
        h0 = hx;
        h1 = _mm_shuffle_epi32(hx, 0x0E);
        h2 = _mm_xor_si128(h0, h1);

        buf = data;
        while (len > 0) {
                __m128i x;
                __m128i t0, t1, t2, v0, v1, v2, v3;
                __m128i y0, y1, y2;

                /*
                 * Load next 128-bit word. If there are not enough bytes
                 * for the next word, we pad it with zeros (as per the
                 * API for this function; it's also what is useful for
                 * implementation of GCM).
                 */
                if (len >= 16) {
                        x = _mm_loadu_si128((const void *)buf);
                        buf += 16;
                        len -= 16;
                } else {
                        unsigned char tmp[16];

                        memcpy(tmp, buf, len);
                        memset(tmp + len, 0, (sizeof tmp) - len);
                        x = _mm_loadu_si128((void *)tmp);
                        len = 0;
                }

                /*
                 * Specification of GCM is basically "full little-endian",
                 * i.e. leftmost bit is least significant; but decoding
                 * performed by _mm_loadu_si128 is "mixed endian" (leftmost
                 * _byte_ is least significant, but within each byte, the
                 * leftmost _bit_ is most significant). We could reverse
                 * bits in each byte; however, it is more efficient to
                 * swap the bytes and thus emulate full big-endian
                 * decoding.
                 *
                 * Big-endian works here because multiplication in
                 * GF(2)[X] is "carry-less", thereby allowing reversal:
                 * if rev_n(x) consists in reversing the order of bits
                 * in x, then:
                 *   rev_128(A)*rev_128(B) = rev_255(A*B)
                 * so we can compute A*B by using rev_128(A) and rev_128(B),
                 * and an extra shift at the end (because 255 != 256). Bit
                 * reversal is exactly what happens when converting from
                 * full little-endian to full big-endian.
                 */
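                /*
                 * Small worked example with 2-bit values: take A = 10
                 * (i.e. X) and B = 11 (i.e. X+1) in GF(2)[X]; then
                 * A*B = X^2+X = 110. With the reversed operands
                 * rev_2(A) = 01 and rev_2(B) = 11, the carry-less
                 * product is X+1 = 011, which is indeed rev_3(110).
                 */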
                x = _mm_shuffle_epi8(x, byteswap_index);
                yx = _mm_xor_si128(yx, x);

                /*
                 * We want the product to be broken down into four
                 * 64-bit values, because there is no SSE* opcode that
                 * can do a shift on a 128-bit value.
                 */
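                /*
                 * The 128x128->256 carry-less multiplication uses the
                 * Karatsuba trick with three pclmulqdq: t0 = ylo*hlo,
                 * t1 = yhi*hhi, and t2 = (ylo^yhi)*(hlo^hhi), from
                 * which the middle term t2^t0^t1 is recovered. The low
                 * lanes of v0..v3 then hold the 255-bit product as four
                 * 64-bit words, v0 being the least significant.
                 */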
                y0 = yx;
                y1 = _mm_shuffle_epi32(yx, 0x0E);
                y2 = _mm_xor_si128(y0, y1);
                t0 = _mm_clmulepi64_si128(y0, h0, 0x00);
                t1 = _mm_clmulepi64_si128(yx, hx, 0x11);
                t2 = _mm_clmulepi64_si128(y2, h2, 0x00);
                t2 = _mm_xor_si128(t2, _mm_xor_si128(t0, t1));
                v0 = t0;
                v1 = _mm_xor_si128(_mm_shuffle_epi32(t0, 0x0E), t2);
                v2 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
                v3 = _mm_shuffle_epi32(t1, 0x0E);

                /*
                 * Do the corrective 1-bit shift (255->256).
                 */
                v3 = _mm_or_si128(
                        _mm_slli_epi64(v3, 1),
                        _mm_srli_epi64(v2, 63));
                v2 = _mm_or_si128(
                        _mm_slli_epi64(v2, 1),
                        _mm_srli_epi64(v1, 63));
                v1 = _mm_or_si128(
                        _mm_slli_epi64(v1, 1),
                        _mm_srli_epi64(v0, 63));
                v0 = _mm_slli_epi64(v0, 1);

                /*
                 * Perform polynomial reduction into GF(2^128).
                 */
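                /*
                 * The GHASH modulus is X^128 + X^7 + X^2 + X + 1. In
                 * this bit-reversed representation, each of the two low
                 * words (v0, then the updated v1) is folded two words
                 * up with right shifts by 0, 1, 2 and 7, while the bits
                 * that cross the 64-bit boundary go one word up with
                 * left shifts by 63, 62 and 57.
                 */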
                v2 = _mm_xor_si128(
                        v2,
                        _mm_xor_si128(
                                _mm_xor_si128(
                                        v0,
                                        _mm_srli_epi64(v0, 1)),
                                _mm_xor_si128(
                                        _mm_srli_epi64(v0, 2),
                                        _mm_srli_epi64(v0, 7))));
                v1 = _mm_xor_si128(
                        _mm_xor_si128(
                                v1,
                                _mm_slli_epi64(v0, 63)),
                        _mm_xor_si128(
                                _mm_slli_epi64(v0, 62),
                                _mm_slli_epi64(v0, 57)));
                v3 = _mm_xor_si128(
                        v3,
                        _mm_xor_si128(
                                _mm_xor_si128(
                                        v1,
                                        _mm_srli_epi64(v1, 1)),
                                _mm_xor_si128(
                                        _mm_srli_epi64(v1, 2),
                                        _mm_srli_epi64(v1, 7))));
                v2 = _mm_xor_si128(
                        _mm_xor_si128(
                                v2,
                                _mm_slli_epi64(v1, 63)),
                        _mm_xor_si128(
                                _mm_slli_epi64(v1, 62),
                                _mm_slli_epi64(v1, 57)));

                /*
                 * We reduced toward the high words (v2 and v3), which
                 * are the new value for y.
                 */
                yx = _mm_unpacklo_epi64(v2, v3);
        }

        yx = _mm_shuffle_epi8(yx, byteswap_index);
        _mm_storeu_si128(y, yx);
}

/*
 * Test CPU support for PCLMULQDQ.
 */
static int
pclmul_supported(void)
{
        /*
         * Bit mask for features in ECX:
         *    1   PCLMULQDQ support
         */
#define MASK   0x00000002

#if BR_AES_X86NI_GCC
        unsigned eax, ebx, ecx, edx;

        if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
                return (ecx & MASK) == MASK;
        } else {
                return 0;
        }
#elif BR_AES_X86NI_MSC
        int info[4];

        __cpuid(info, 1);
        return ((uint32_t)info[2] & MASK) == MASK;
#else
        return 0;
#endif

#undef MASK
}

/* see bearssl_hash.h */
br_ghash
br_ghash_pclmul_get(void)
{
        return pclmul_supported() ? &br_ghash_pclmul : 0;
}
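
/*
 * Usage sketch: a caller is expected to probe for PCLMUL support at
 * run time and fall back to one of the portable GHASH implementations
 * (e.g. br_ghash_ctmul32, also declared in bearssl_hash.h) when this
 * function returns 0:
 *
 *    br_ghash gh;
 *
 *    gh = br_ghash_pclmul_get();
 *    if (gh == 0) {
 *            gh = &br_ghash_ctmul32;
 *    }
 *    gh(y, h, data, len);
 */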

#else

/* see bearssl_hash.h */
br_ghash
br_ghash_pclmul_get(void)
{
        return 0;
}

#endif