/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
/* Enable the POWER assembly helper macros (must precede inner.h). */
#define BR_POWER_ASM_MACROS   1
/*
 * This is the GHASH implementation that leverages the POWER8 opcodes.
 */
/*
 * Some symbolic names for registers.
 *   HB0 = 16 bytes of value 0
 *   HB1 = 16 bytes of value 1
 *   HB2 = 16 bytes of value 2
 *   HB6 = 16 bytes of value 6
 *   HB7 = 16 bytes of value 7
 *   TT0, TT1 and TT2 are temporaries
 *
 * BSW holds the pattern for byteswapping 32-bit words; this is set only
 * on little-endian systems. XBSW is the same register with the +32 offset
 * for access with the VSX opcodes.
 */
/*
 * Macro to initialise the constants.
 */
/*
 * Fix endianness of a value after reading it or before writing it, if
 * necessary.
 */
/*
 * On little-endian systems, INIT_BSW loads the byteswap pattern into
 * BSW/XBSW and FIX_ENDIAN() permutes a register through it; on
 * big-endian systems both macros expand to nothing. The two competing
 * FIX_ENDIAN definitions in the damaged copy demonstrate the dropped
 * #if/#else structure restored here.
 */
#if BR_POWER8_LE
#define INIT_BSW         lxvw4x(XBSW, 0, %[idx2be])
#define FIX_ENDIAN(xx)   vperm(xx, xx, xx, BSW)
#else
#define INIT_BSW
#define FIX_ENDIAN(xx)
#endif
/*
 * Left-shift x0:x1 by one bit to the left. This is a corrective action
 * needed because GHASH is defined in full little-endian specification,
 * while the opcodes use full big-endian convention, so the 255-bit product
 * ends up one bit to the right.
 */
/*
 * SL_256(x0, x1): shift the 256-bit value x0:x1 one bit to the left.
 * NOTE(review): extraction damage -- the remaining continuation lines of
 * this macro are missing from this copy; the fragment below is kept
 * byte-for-byte as found (including the fused original line numbers).
 * Restore from an intact copy of the file before building.
 */
88 #define SL_256(x0, x1) \
89 vsldoi(TT0, HB0, x1, 1) \
/*
 * Reduce x0:x1 in GF(2^128), result in xd (register xd may be the same as
 * x0 or x1, or a different register). x0 and x1 are modified.
 */
/*
 * REDUCE_F128(xd, x0, x1): modular reduction of the 256-bit product
 * x0:x1 in GF(2^128), result placed in xd.
 * NOTE(review): extraction damage -- most continuation lines of this
 * macro are missing from this copy; the surviving fragments below are
 * kept byte-for-byte as found (including the fused original line
 * numbers). Restore from an intact copy of the file before building.
 */
99 #define REDUCE_F128(xd, x0, x1) \
105 vxor(TT1, TT1, TT2) \
107 vsldoi(x1, x1, HB0, 15) \
116 vxor(TT1, TT1, TT2) \
/* see bearssl_hash.h */
/*
 * GHASH update using the POWER8 vector opcodes.
 * Per the br_ghash contract (bearssl_hash.h): y points to the 16-byte
 * GHASH state, updated in place; h is the 16-byte hash key; data/len is
 * the input to absorb (padded internally to a multiple of 16 bytes).
 *
 * NOTE(review): this copy of the function is damaged by extraction --
 * many original lines (including the opening asm statement, the bulk of
 * the assembly body, and the closing braces) are missing, the original
 * file's line numbers are fused into the text, and long lines are
 * wrapped. The text below is therefore left byte-for-byte as found,
 * with review comments added only at section boundaries. Restore from
 * an intact copy of BearSSL's ghash_pwr8.c before attempting to build.
 */
121 br_ghash_pwr8(void *y
, const void *h
, const void *data
, size_t len
)
123 const unsigned char *buf1
, *buf2
;
125 unsigned char tmp
[64];
126 long cc0
, cc1
, cc2
, cc3
;
/*
 * idx2be: index pattern for byteswapping 32-bit words, loaded with
 * lxvw4x via the %[idx2be] operand (see the constraint list below).
 */
129 static const uint32_t idx2be
[] = {
130 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
137 * Assembly code requires data into two chunks; first chunk
138 * must contain a number of blocks which is a multiple of 4.
139 * Since the processing for the first chunk is faster, we want
140 * to make it as big as possible.
142 * For the remainder, there are two possibilities:
143 * -- if the remainder size is a multiple of 16, then use it
145 * -- otherwise, copy it to the tmp[] array and pad it with
/*
 * NOTE(review): the initializations of buf1 (from 'data') and num4
 * (number of 4-block groups in the first chunk) were on lines now
 * missing -- buf2/num1 below depend on them; TODO restore.
 */
149 buf2
= buf1
+ (num4
<< 6);
151 num1
= (len
+ 15) >> 4;
152 if ((len
& 15) != 0) {
153 memcpy(tmp
, buf2
, len
);
154 memset(tmp
+ len
, 0, (num1
<< 4) - len
);
/*
 * Inline assembly begins around here (the opening asm statement was
 * lost to extraction). Per the surviving comments: h is loaded into
 * v9 and the current y into v28, then h is split across registers.
 */
166 * Load current h (denoted hereafter h1) in v9.
172 * Load current y into v28.
178 * Split h1 into three registers:
183 xxpermdi(49, 41, 41, 2)
184 vsldoi(18, HB0
, 9, 8)
185 vsldoi(19, 9, HB0
, 8)
188 * If num4 is 0, skip directly to the second chunk.
/*
 * First chunk: compute h^2, h^3 and h^4 so that four blocks can be
 * multiplied in per loop iteration (per the surviving comments).
 */
194 * Compute h2 = h*h in v10.
199 REDUCE_F128(10, 10, 11)
202 * Compute h3 = h*h*h in v11.
203 * We first split h2 into:
207 * Then we do the product with h1, and reduce into v11.
209 vsldoi(11, HB0
, 10, 8)
210 vsldoi(12, 10, HB0
, 8)
214 vsldoi(14, HB0
, 13, 8)
215 vsldoi(15, 13, HB0
, 8)
219 REDUCE_F128(11, 11, 12)
222 * Compute h4 = h*h*h*h in v12. This is done by squaring h2.
224 vsldoi(12, HB0
, 10, 8)
225 vsldoi(13, 10, HB0
, 8)
229 REDUCE_F128(12, 12, 13)
232 * Repack h1, h2, h3 and h4:
238 xxpermdi(45, 44, 43, 0)
239 xxpermdi(46, 44, 43, 3)
240 xxpermdi(47, 42, 41, 0)
241 xxpermdi(48, 42, 41, 3)
244 * Loop for each group of four blocks.
249 * Read the four next blocks.
255 lxvw4x(52, %[cc0
], %[buf1
])
256 lxvw4x(53, %[cc1
], %[buf1
])
257 lxvw4x(54, %[cc2
], %[buf1
])
258 lxvw4x(55, %[cc3
], %[buf1
])
263 addi(%[buf1
], %[buf1
], 64)
267 * Repack the blocks into v9, v10, v11 and v12.
273 xxpermdi(41, 52, 53, 0)
274 xxpermdi(42, 52, 53, 3)
275 xxpermdi(43, 54, 55, 0)
276 xxpermdi(44, 54, 55, 3)
279 * Compute the products.
280 * v20 = b0_0*h4_0 + b1_0*h3_0
281 * v21 = b0_1*h4_0 + b1_1*h3_0
282 * v22 = b0_0*h4_1 + b1_0*h3_1
283 * v23 = b0_1*h4_1 + b1_1*h3_1
284 * v24 = b2_0*h2_0 + b3_0*h1_0
285 * v25 = b2_1*h2_0 + b3_1*h1_0
286 * v26 = b2_0*h2_1 + b3_0*h1_1
287 * v27 = b2_1*h2_1 + b3_1*h1_1
299 * Sum products into a single 256-bit result in v11:v12.
306 vsldoi( 9, HB0
, 20, 8)
307 vsldoi(10, 20, HB0
, 8)
312 * Fix and reduce in GF(2^128); this is the new y (in v28).
315 REDUCE_F128(28, 11, 12)
318 * Loop for next group of four blocks.
/*
 * Second chunk: remaining blocks processed one at a time (per the
 * surviving comments).
 */
323 * Process second chunk, one block at a time.
332 * Load next data block and XOR it into y.
334 lxvw4x(41, 0, %[buf2
])
338 addi(%[buf2
], %[buf2
], 16)
342 * Split y into doublewords:
347 vsldoi(10, HB0
, 9, 8)
348 vsldoi(11, 9, HB0
, 8)
351 * Compute products with h:
354 * v14 = y_1 * h_0 + y_0 * h_1
361 * Propagate v14 into v12:v13 to finalise product.
363 vsldoi(10, HB0
, 14, 8)
364 vsldoi(11, 14, HB0
, 8)
369 * Fix result and reduce into v28 (next value for y).
372 REDUCE_F128(28, 12, 13)
377 * Write back the new y.
/*
 * asm operand lists: read-write pointer operands, read-only inputs,
 * and the vector-register clobber list (truncated by extraction --
 * the tail of the clobber list and the closing of the asm statement
 * and function are missing).
 */
382 : [buf1
] "+b" (buf1
), [buf2
] "+b" (buf2
)
383 : [y
] "b" (y
), [h
] "b" (h
), [num4
] "b" (num4
), [num1
] "b" (num1
),
384 [cc0
] "b" (cc0
), [cc1
] "b" (cc1
), [cc2
] "b" (cc2
), [cc3
] "b" (cc3
)
386 , [idx2be
] "b" (idx2be
)
388 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
389 "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
390 "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
/* see bearssl_hash.h */
397 br_ghash_pwr8_get(void)
399 return &br_ghash_pwr8
;
/* see bearssl_hash.h */
406 br_ghash_pwr8_get(void)