+/*
+ * GHASH is defined over elements of GF(2^128) with "full little-endian"
+ * representation: leftmost byte is least significant, and, within each
+ * byte, leftmost _bit_ is least significant. The natural ordering in
+ * x86 is "mixed little-endian": bytes are ordered from least to most
+ * significant, but bits within a byte are in most-to-least significant
+ * order. Going to full little-endian representation would require
+ * reversing bits within each byte, which is doable but expensive.
+ *
+ * Instead, we go to full big-endian representation, by swapping bytes
+ * around, which is done with a single _mm_shuffle_epi8() opcode (it
+ * comes with SSSE3; all CPU that offer pclmulqdq also have SSSE3). We
+ * can use a full big-endian representation because in a carryless
+ * multiplication, we have a nice bit reversal property:
+ *
+ * rev_128(x) * rev_128(y) = rev_255(x * y)
+ *
+ * So by using full big-endian, we still get the right result, except
+ * that it is right-shifted by 1 bit. The left-shift is relatively
+ * inexpensive, and it can be mutualised.
+ *
+ *
+ * Since SSE2 opcodes do not have facilities for shitfting full 128-bit
+ * values with bit precision, we have to break down values into 64-bit
+ * chunks. We number chunks from 0 to 3 in left to right order.
+ */
+
+/*
+ * From a 128-bit value kw, compute kx as the XOR of the two 64-bit
+ * halves of kw (into the right half of kx; left half is unspecified).
+ */
+#define BK(kw, kx) do { \
+ kx = _mm_xor_si128(kw, _mm_shuffle_epi32(kw, 0x0E)); \
+ } while (0)
+
+/*
+ * Combine two 64-bit values (k0:k1) into a 128-bit (kw) value and
+ * the XOR of the two values (kx).
+ */
+#define PBK(k0, k1, kw, kx) do { \
+ kw = _mm_unpacklo_epi64(k1, k0); \
+ kx = _mm_xor_si128(k0, k1); \
+ } while (0)
+
+/*
+ * Left-shift by 1 bit a 256-bit value (in four 64-bit words).
+ */
+#define SL_256(x0, x1, x2, x3) do { \
+ x0 = _mm_or_si128( \
+ _mm_slli_epi64(x0, 1), \
+ _mm_srli_epi64(x1, 63)); \
+ x1 = _mm_or_si128( \
+ _mm_slli_epi64(x1, 1), \
+ _mm_srli_epi64(x2, 63)); \
+ x2 = _mm_or_si128( \
+ _mm_slli_epi64(x2, 1), \
+ _mm_srli_epi64(x3, 63)); \
+ x3 = _mm_slli_epi64(x3, 1); \
+ } while (0)
+
+/*
+ * Perform reduction in GF(2^128). The 256-bit value is in x0..x3;
+ * result is written in x0..x1.
+ */
+#define REDUCE_F128(x0, x1, x2, x3) do { \
+ x1 = _mm_xor_si128( \
+ x1, \
+ _mm_xor_si128( \
+ _mm_xor_si128( \
+ x3, \
+ _mm_srli_epi64(x3, 1)), \
+ _mm_xor_si128( \
+ _mm_srli_epi64(x3, 2), \
+ _mm_srli_epi64(x3, 7)))); \
+ x2 = _mm_xor_si128( \
+ _mm_xor_si128( \
+ x2, \
+ _mm_slli_epi64(x3, 63)), \
+ _mm_xor_si128( \
+ _mm_slli_epi64(x3, 62), \
+ _mm_slli_epi64(x3, 57))); \
+ x0 = _mm_xor_si128( \
+ x0, \
+ _mm_xor_si128( \
+ _mm_xor_si128( \
+ x2, \
+ _mm_srli_epi64(x2, 1)), \
+ _mm_xor_si128( \
+ _mm_srli_epi64(x2, 2), \
+ _mm_srli_epi64(x2, 7)))); \
+ x1 = _mm_xor_si128( \
+ _mm_xor_si128( \
+ x1, \
+ _mm_slli_epi64(x2, 63)), \
+ _mm_xor_si128( \
+ _mm_slli_epi64(x2, 62), \
+ _mm_slli_epi64(x2, 57))); \
+ } while (0)
+
+/*
+ * Square value kw into (dw,dx).
+ */
+#define SQUARE_F128(kw, dw, dx) do { \
+ __m128i z0, z1, z2, z3; \
+ z1 = _mm_clmulepi64_si128(kw, kw, 0x11); \
+ z3 = _mm_clmulepi64_si128(kw, kw, 0x00); \
+ z0 = _mm_shuffle_epi32(z1, 0x0E); \
+ z2 = _mm_shuffle_epi32(z3, 0x0E); \
+ SL_256(z0, z1, z2, z3); \
+ REDUCE_F128(z0, z1, z2, z3); \
+ PBK(z0, z1, dw, dx); \
+ } while (0)
+