X-Git-Url: https://www.bearssl.org/gitweb//home/git/?p=BearSSL;a=blobdiff_plain;f=src%2Fhash%2Fghash_pclmul.c;fp=src%2Fhash%2Fghash_pclmul.c;h=c70988933df551395cb5bb07fa120beb2c59d933;hp=0000000000000000000000000000000000000000;hb=5f045c759957fdff8c85716e6af99e10901fdac0;hpb=556e525d62cd5559e74fe4d2777a59d33590a033

diff --git a/src/hash/ghash_pclmul.c b/src/hash/ghash_pclmul.c
new file mode 100644
index 0000000..c709889
--- /dev/null
+++ b/src/hash/ghash_pclmul.c
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * This is the GHASH implementation that leverages the pclmulqdq opcode
+ * (from the AES-NI instructions).
+ */
+
+#if BR_AES_X86NI
+
+#if BR_AES_X86NI_GCC
+/* #pragma GCC target "sse2,ssse3,pclmul" */
+#include <tmmintrin.h>
+#include <wmmintrin.h>
+#include <cpuid.h>
+#endif
+
+#if BR_AES_X86NI_MSC
+#include <intrin.h>
+#endif
+
+/* see bearssl_hash.h; the 16-byte value at y is updated in place */
+BR_TARGET("ssse3,pclmul")
+void
+br_ghash_pclmul(void *y, const void *h, const void *data, size_t len)
+{
+	/*
+	 * TODO: loop below processes one 16-byte word at a time. We
+	 * could parallelize, using:
+	 *   ((y+x0)*h+x1)*h = (y+x0)*(h^2) + x1*h
+	 * i.e. precompute h^2, then handle two words at a time, mostly
+	 * in parallel (this may extend to more words as well...).
+	 */
+
+	const unsigned char *buf;
+	__m128i yx, hx;
+	__m128i h0, h1, h2;
+	__m128i byteswap_index;
+
+	byteswap_index = _mm_set_epi8(
+		0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);	/* shuffle mask: reverse the 16 bytes */
+	yx = _mm_loadu_si128(y);
+	hx = _mm_loadu_si128(h);
+	yx = _mm_shuffle_epi8(yx, byteswap_index);
+	hx = _mm_shuffle_epi8(hx, byteswap_index);
+
+	/*
+	 * We byte-swap y and h for full big-endian interpretation
+	 * (see below).
+	 */
+
+	h0 = hx;	/* low 64 bits of h are in the low lane */
+	h1 = _mm_shuffle_epi32(hx, 0x0E);	/* high 64 bits of h moved to the low lane */
+	h2 = _mm_xor_si128(h0, h1);	/* low ^ high, for the Karatsuba middle term */
+
+	buf = data;
+	while (len > 0) {
+		__m128i x;
+		__m128i t0, t1, t2, v0, v1, v2, v3;
+		__m128i y0, y1, y2;
+
+		/*
+		 * Load next 128-bit word. If there are not enough bytes
+		 * for the next word, we pad it with zeros (as per the
+		 * API for this function; it's also what is useful for
+		 * implementation of GCM).
+		 */
+		if (len >= 16) {
+			x = _mm_loadu_si128((const void *)buf);
+			buf += 16;
+			len -= 16;
+		} else {
+			unsigned char tmp[16];
+
+			memcpy(tmp, buf, len);
+			memset(tmp + len, 0, (sizeof tmp) - len);
+			x = _mm_loadu_si128((void *)tmp);
+			len = 0;
+		}
+
+		/*
+		 * Specification of GCM is basically "full little-endian",
+		 * i.e. leftmost bit is least significant; but decoding
+		 * performed by _mm_loadu_si128 is "mixed endian" (leftmost
+		 * _byte_ is least significant, but within each byte, the
+		 * leftmost _bit_ is most significant). We could reverse
+		 * bits in each byte; however, it is more efficient to
+		 * swap the bytes and thus emulate full big-endian
+		 * decoding.
+		 *
+		 * Big-endian works here because multiplication in
+		 * GF[2](X) is "carry-less", thereby allowing reversal:
+		 * if rev_n(x) consists in reversing the order of bits
+		 * in x, then:
+		 *   rev_128(A)*rev_128(B) = rev_255(A*B)
+		 * so we can compute A*B by using rev_128(A) and rev_128(B),
+		 * and an extra shift at the end (because 255 != 256). Bit
+		 * reversal is exactly what happens when converting from
+		 * full little-endian to full big-endian.
+		 */
+		x = _mm_shuffle_epi8(x, byteswap_index);
+		yx = _mm_xor_si128(yx, x);
+
+		/*
+		 * We want the product to be broken down into four
+		 * 64-bit values, because there is no SSE* opcode that
+		 * can do a bit-level shift on a full 128-bit value.
+		 */
+		y0 = yx;
+		y1 = _mm_shuffle_epi32(yx, 0x0E);
+		y2 = _mm_xor_si128(y0, y1);
+		t0 = _mm_clmulepi64_si128(y0, h0, 0x00);	/* low(y) * low(h) */
+		t1 = _mm_clmulepi64_si128(yx, hx, 0x11);	/* high(y) * high(h) */
+		t2 = _mm_clmulepi64_si128(y2, h2, 0x00);	/* (low^high)(y) * (low^high)(h) */
+		t2 = _mm_xor_si128(t2, _mm_xor_si128(t0, t1));	/* Karatsuba middle term */
+		v0 = t0;	/* v0..v3: 64-bit limbs of the product, kept in the low lanes */
+		v1 = _mm_xor_si128(_mm_shuffle_epi32(t0, 0x0E), t2);
+		v2 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
+		v3 = _mm_shuffle_epi32(t1, 0x0E);
+
+		/*
+		 * Do the corrective 1-bit shift (255->256).
+		 */
+		v3 = _mm_or_si128(
+			_mm_slli_epi64(v3, 1),
+			_mm_srli_epi64(v2, 63));
+		v2 = _mm_or_si128(
+			_mm_slli_epi64(v2, 1),
+			_mm_srli_epi64(v1, 63));
+		v1 = _mm_or_si128(
+			_mm_slli_epi64(v1, 1),
+			_mm_srli_epi64(v0, 63));
+		v0 = _mm_slli_epi64(v0, 1);
+
+		/*
+		 * Polynomial reduction into GF(2^128): fold v0 into v1:v2, then v1 into v2:v3.
+		 */
+		v2 = _mm_xor_si128(
+			v2,
+			_mm_xor_si128(
+				_mm_xor_si128(
+					v0,
+					_mm_srli_epi64(v0, 1)),
+				_mm_xor_si128(
+					_mm_srli_epi64(v0, 2),
+					_mm_srli_epi64(v0, 7))));
+		v1 = _mm_xor_si128(
+			_mm_xor_si128(
+				v1,
+				_mm_slli_epi64(v0, 63)),
+			_mm_xor_si128(
+				_mm_slli_epi64(v0, 62),
+				_mm_slli_epi64(v0, 57)));
+		v3 = _mm_xor_si128(
+			v3,
+			_mm_xor_si128(
+				_mm_xor_si128(
+					v1,
+					_mm_srli_epi64(v1, 1)),
+				_mm_xor_si128(
+					_mm_srli_epi64(v1, 2),
+					_mm_srli_epi64(v1, 7))));
+		v2 = _mm_xor_si128(
+			_mm_xor_si128(
+				v2,
+				_mm_slli_epi64(v1, 63)),
+			_mm_xor_si128(
+				_mm_slli_epi64(v1, 62),
+				_mm_slli_epi64(v1, 57)));
+
+		/*
+		 * We reduced toward the high words (v2 and v3), which
+		 * are the new value for y.
+		 */
+		yx = _mm_unpacklo_epi64(v2, v3);
+	}
+
+	yx = _mm_shuffle_epi8(yx, byteswap_index);	/* undo the initial byte swap */
+	_mm_storeu_si128(y, yx);
+}
+
+/*
+ * Test CPU support for PCLMULQDQ; returns 1 if supported, 0 otherwise.
+ */
+static int
+pclmul_supported(void)
+{
+	/*
+	 * Bit mask for features in ECX (CPUID leaf 1); bit 1 is PCLMULQDQ.
+	 * NOTE(review): SSSE3, also required by br_ghash_pclmul, is not tested -- confirm.
+	 */
+#define MASK   0x00000002
+
+#if BR_AES_X86NI_GCC
+	unsigned eax, ebx, ecx, edx;
+
+	if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
+		return (ecx & MASK) == MASK;
+	} else {
+		return 0;	/* __get_cpuid() reported failure for leaf 1 */
+	}
+#elif BR_AES_X86NI_MSC
+	int info[4];
+
+	__cpuid(info, 1);
+	return ((uint32_t)info[2] & MASK) == MASK;	/* info[2] presumably holds ECX -- see MSVC __cpuid docs */
+#else
+	return 0;
+#endif
+
+#undef MASK
+}
+
+/* see bearssl_hash.h; yields &br_ghash_pclmul if pclmul_supported() is true, else 0 */
+br_ghash
+br_ghash_pclmul_get(void)
+{
+	return pclmul_supported() ? &br_ghash_pclmul : 0;
+}
+
+#else
+
+/* see bearssl_hash.h; variant compiled when BR_AES_X86NI is false: always returns 0 */
+br_ghash
+br_ghash_pclmul_get(void)
+{
+	return 0;
+}
+
+#endif