2 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
4 * Permission is hereby granted, free of charge, to any person obtaining
5 * a copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25 #define BR_ENABLE_INTRINSICS 1
30 /* see bearssl_block.h */
31 const br_block_ctrcbc_class
*
32 br_aes_x86ni_ctrcbc_get_vtable(void)
34 return br_aes_x86ni_supported() ? &br_aes_x86ni_ctrcbc_vtable
: NULL
;
37 /* see bearssl_block.h */
39 br_aes_x86ni_ctrcbc_init(br_aes_x86ni_ctrcbc_keys
*ctx
,
40 const void *key
, size_t len
)
42 ctx
->vtable
= &br_aes_x86ni_ctrcbc_vtable
;
43 ctx
->num_rounds
= br_aes_x86ni_keysched_enc(ctx
->skey
.skni
, key
, len
);
48 /* see bearssl_block.h */
49 BR_TARGET("sse2,sse4.1,aes")
51 br_aes_x86ni_ctrcbc_ctr(const br_aes_x86ni_ctrcbc_keys
*ctx
,
52 void *ctr
, void *data
, size_t len
)
57 __m128i ivx0
, ivx1
, ivx2
, ivx3
;
58 __m128i erev
, zero
, one
, four
, notthree
;
62 num_rounds
= ctx
->num_rounds
;
63 for (u
= 0; u
<= num_rounds
; u
++) {
64 sk
[u
] = _mm_loadu_si128((void *)(ctx
->skey
.skni
+ (u
<< 4)));
68 * Some SSE2 constants.
70 erev
= _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
71 8, 9, 10, 11, 12, 13, 14, 15);
72 zero
= _mm_setzero_si128();
73 one
= _mm_set_epi64x(0, 1);
74 four
= _mm_set_epi64x(0, 4);
75 notthree
= _mm_sub_epi64(zero
, four
);
78 * Decode the counter in big-endian and pre-increment the other
81 ivx0
= _mm_shuffle_epi8(_mm_loadu_si128((void *)ctr
), erev
);
82 ivx1
= _mm_add_epi64(ivx0
, one
);
83 ivx1
= _mm_sub_epi64(ivx1
,
84 _mm_slli_si128(_mm_cmpeq_epi64(ivx1
, zero
), 8));
85 ivx2
= _mm_add_epi64(ivx1
, one
);
86 ivx2
= _mm_sub_epi64(ivx2
,
87 _mm_slli_si128(_mm_cmpeq_epi64(ivx2
, zero
), 8));
88 ivx3
= _mm_add_epi64(ivx2
, one
);
89 ivx3
= _mm_sub_epi64(ivx3
,
90 _mm_slli_si128(_mm_cmpeq_epi64(ivx3
, zero
), 8));
92 __m128i x0
, x1
, x2
, x3
;
95 * Load counter values; we need to byteswap them because
96 * the specification says that they use big-endian.
98 x0
= _mm_shuffle_epi8(ivx0
, erev
);
99 x1
= _mm_shuffle_epi8(ivx1
, erev
);
100 x2
= _mm_shuffle_epi8(ivx2
, erev
);
101 x3
= _mm_shuffle_epi8(ivx3
, erev
);
103 x0
= _mm_xor_si128(x0
, sk
[0]);
104 x1
= _mm_xor_si128(x1
, sk
[0]);
105 x2
= _mm_xor_si128(x2
, sk
[0]);
106 x3
= _mm_xor_si128(x3
, sk
[0]);
107 x0
= _mm_aesenc_si128(x0
, sk
[1]);
108 x1
= _mm_aesenc_si128(x1
, sk
[1]);
109 x2
= _mm_aesenc_si128(x2
, sk
[1]);
110 x3
= _mm_aesenc_si128(x3
, sk
[1]);
111 x0
= _mm_aesenc_si128(x0
, sk
[2]);
112 x1
= _mm_aesenc_si128(x1
, sk
[2]);
113 x2
= _mm_aesenc_si128(x2
, sk
[2]);
114 x3
= _mm_aesenc_si128(x3
, sk
[2]);
115 x0
= _mm_aesenc_si128(x0
, sk
[3]);
116 x1
= _mm_aesenc_si128(x1
, sk
[3]);
117 x2
= _mm_aesenc_si128(x2
, sk
[3]);
118 x3
= _mm_aesenc_si128(x3
, sk
[3]);
119 x0
= _mm_aesenc_si128(x0
, sk
[4]);
120 x1
= _mm_aesenc_si128(x1
, sk
[4]);
121 x2
= _mm_aesenc_si128(x2
, sk
[4]);
122 x3
= _mm_aesenc_si128(x3
, sk
[4]);
123 x0
= _mm_aesenc_si128(x0
, sk
[5]);
124 x1
= _mm_aesenc_si128(x1
, sk
[5]);
125 x2
= _mm_aesenc_si128(x2
, sk
[5]);
126 x3
= _mm_aesenc_si128(x3
, sk
[5]);
127 x0
= _mm_aesenc_si128(x0
, sk
[6]);
128 x1
= _mm_aesenc_si128(x1
, sk
[6]);
129 x2
= _mm_aesenc_si128(x2
, sk
[6]);
130 x3
= _mm_aesenc_si128(x3
, sk
[6]);
131 x0
= _mm_aesenc_si128(x0
, sk
[7]);
132 x1
= _mm_aesenc_si128(x1
, sk
[7]);
133 x2
= _mm_aesenc_si128(x2
, sk
[7]);
134 x3
= _mm_aesenc_si128(x3
, sk
[7]);
135 x0
= _mm_aesenc_si128(x0
, sk
[8]);
136 x1
= _mm_aesenc_si128(x1
, sk
[8]);
137 x2
= _mm_aesenc_si128(x2
, sk
[8]);
138 x3
= _mm_aesenc_si128(x3
, sk
[8]);
139 x0
= _mm_aesenc_si128(x0
, sk
[9]);
140 x1
= _mm_aesenc_si128(x1
, sk
[9]);
141 x2
= _mm_aesenc_si128(x2
, sk
[9]);
142 x3
= _mm_aesenc_si128(x3
, sk
[9]);
143 if (num_rounds
== 10) {
144 x0
= _mm_aesenclast_si128(x0
, sk
[10]);
145 x1
= _mm_aesenclast_si128(x1
, sk
[10]);
146 x2
= _mm_aesenclast_si128(x2
, sk
[10]);
147 x3
= _mm_aesenclast_si128(x3
, sk
[10]);
148 } else if (num_rounds
== 12) {
149 x0
= _mm_aesenc_si128(x0
, sk
[10]);
150 x1
= _mm_aesenc_si128(x1
, sk
[10]);
151 x2
= _mm_aesenc_si128(x2
, sk
[10]);
152 x3
= _mm_aesenc_si128(x3
, sk
[10]);
153 x0
= _mm_aesenc_si128(x0
, sk
[11]);
154 x1
= _mm_aesenc_si128(x1
, sk
[11]);
155 x2
= _mm_aesenc_si128(x2
, sk
[11]);
156 x3
= _mm_aesenc_si128(x3
, sk
[11]);
157 x0
= _mm_aesenclast_si128(x0
, sk
[12]);
158 x1
= _mm_aesenclast_si128(x1
, sk
[12]);
159 x2
= _mm_aesenclast_si128(x2
, sk
[12]);
160 x3
= _mm_aesenclast_si128(x3
, sk
[12]);
162 x0
= _mm_aesenc_si128(x0
, sk
[10]);
163 x1
= _mm_aesenc_si128(x1
, sk
[10]);
164 x2
= _mm_aesenc_si128(x2
, sk
[10]);
165 x3
= _mm_aesenc_si128(x3
, sk
[10]);
166 x0
= _mm_aesenc_si128(x0
, sk
[11]);
167 x1
= _mm_aesenc_si128(x1
, sk
[11]);
168 x2
= _mm_aesenc_si128(x2
, sk
[11]);
169 x3
= _mm_aesenc_si128(x3
, sk
[11]);
170 x0
= _mm_aesenc_si128(x0
, sk
[12]);
171 x1
= _mm_aesenc_si128(x1
, sk
[12]);
172 x2
= _mm_aesenc_si128(x2
, sk
[12]);
173 x3
= _mm_aesenc_si128(x3
, sk
[12]);
174 x0
= _mm_aesenc_si128(x0
, sk
[13]);
175 x1
= _mm_aesenc_si128(x1
, sk
[13]);
176 x2
= _mm_aesenc_si128(x2
, sk
[13]);
177 x3
= _mm_aesenc_si128(x3
, sk
[13]);
178 x0
= _mm_aesenclast_si128(x0
, sk
[14]);
179 x1
= _mm_aesenclast_si128(x1
, sk
[14]);
180 x2
= _mm_aesenclast_si128(x2
, sk
[14]);
181 x3
= _mm_aesenclast_si128(x3
, sk
[14]);
184 x0
= _mm_xor_si128(x0
,
185 _mm_loadu_si128((void *)(buf
+ 0)));
186 x1
= _mm_xor_si128(x1
,
187 _mm_loadu_si128((void *)(buf
+ 16)));
188 x2
= _mm_xor_si128(x2
,
189 _mm_loadu_si128((void *)(buf
+ 32)));
190 x3
= _mm_xor_si128(x3
,
191 _mm_loadu_si128((void *)(buf
+ 48)));
192 _mm_storeu_si128((void *)(buf
+ 0), x0
);
193 _mm_storeu_si128((void *)(buf
+ 16), x1
);
194 _mm_storeu_si128((void *)(buf
+ 32), x2
);
195 _mm_storeu_si128((void *)(buf
+ 48), x3
);
199 unsigned char tmp
[64];
201 _mm_storeu_si128((void *)(tmp
+ 0), x0
);
202 _mm_storeu_si128((void *)(tmp
+ 16), x1
);
203 _mm_storeu_si128((void *)(tmp
+ 32), x2
);
204 _mm_storeu_si128((void *)(tmp
+ 48), x3
);
205 for (u
= 0; u
< len
; u
++) {
223 * Add 4 to each counter value. For carry propagation
224 * into the upper 64-bit words, we would need to compare
225 * the results with 4, but SSE2+ has only _signed_
226 * comparisons. Instead, we mask out the low two bits,
227 * and check whether the remaining bits are zero.
229 ivx0
= _mm_add_epi64(ivx0
, four
);
230 ivx1
= _mm_add_epi64(ivx1
, four
);
231 ivx2
= _mm_add_epi64(ivx2
, four
);
232 ivx3
= _mm_add_epi64(ivx3
, four
);
233 ivx0
= _mm_sub_epi64(ivx0
,
234 _mm_slli_si128(_mm_cmpeq_epi64(
235 _mm_and_si128(ivx0
, notthree
), zero
), 8));
236 ivx1
= _mm_sub_epi64(ivx1
,
237 _mm_slli_si128(_mm_cmpeq_epi64(
238 _mm_and_si128(ivx1
, notthree
), zero
), 8));
239 ivx2
= _mm_sub_epi64(ivx2
,
240 _mm_slli_si128(_mm_cmpeq_epi64(
241 _mm_and_si128(ivx2
, notthree
), zero
), 8));
242 ivx3
= _mm_sub_epi64(ivx3
,
243 _mm_slli_si128(_mm_cmpeq_epi64(
244 _mm_and_si128(ivx3
, notthree
), zero
), 8));
248 * Write back new counter value. The loop took care to put the
249 * right counter value in ivx0.
251 _mm_storeu_si128((void *)ctr
, _mm_shuffle_epi8(ivx0
, erev
));
254 /* see bearssl_block.h */
255 BR_TARGET("sse2,sse4.1,aes")
257 br_aes_x86ni_ctrcbc_mac(const br_aes_x86ni_ctrcbc_keys
*ctx
,
258 void *cbcmac
, const void *data
, size_t len
)
260 const unsigned char *buf
;
266 ivx
= _mm_loadu_si128(cbcmac
);
267 num_rounds
= ctx
->num_rounds
;
268 for (u
= 0; u
<= num_rounds
; u
++) {
269 sk
[u
] = _mm_loadu_si128((void *)(ctx
->skey
.skni
+ (u
<< 4)));
274 x
= _mm_xor_si128(_mm_loadu_si128((void *)buf
), ivx
);
275 x
= _mm_xor_si128(x
, sk
[0]);
276 x
= _mm_aesenc_si128(x
, sk
[1]);
277 x
= _mm_aesenc_si128(x
, sk
[2]);
278 x
= _mm_aesenc_si128(x
, sk
[3]);
279 x
= _mm_aesenc_si128(x
, sk
[4]);
280 x
= _mm_aesenc_si128(x
, sk
[5]);
281 x
= _mm_aesenc_si128(x
, sk
[6]);
282 x
= _mm_aesenc_si128(x
, sk
[7]);
283 x
= _mm_aesenc_si128(x
, sk
[8]);
284 x
= _mm_aesenc_si128(x
, sk
[9]);
285 if (num_rounds
== 10) {
286 x
= _mm_aesenclast_si128(x
, sk
[10]);
287 } else if (num_rounds
== 12) {
288 x
= _mm_aesenc_si128(x
, sk
[10]);
289 x
= _mm_aesenc_si128(x
, sk
[11]);
290 x
= _mm_aesenclast_si128(x
, sk
[12]);
292 x
= _mm_aesenc_si128(x
, sk
[10]);
293 x
= _mm_aesenc_si128(x
, sk
[11]);
294 x
= _mm_aesenc_si128(x
, sk
[12]);
295 x
= _mm_aesenc_si128(x
, sk
[13]);
296 x
= _mm_aesenclast_si128(x
, sk
[14]);
302 _mm_storeu_si128(cbcmac
, ivx
);
305 /* see bearssl_block.h */
306 BR_TARGET("sse2,sse4.1,aes")
308 br_aes_x86ni_ctrcbc_encrypt(const br_aes_x86ni_ctrcbc_keys
*ctx
,
309 void *ctr
, void *cbcmac
, void *data
, size_t len
)
315 __m128i erev
, zero
, one
;
319 num_rounds
= ctx
->num_rounds
;
320 for (u
= 0; u
<= num_rounds
; u
++) {
321 sk
[u
] = _mm_loadu_si128((void *)(ctx
->skey
.skni
+ (u
<< 4)));
325 * Some SSE2 constants.
327 erev
= _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
328 8, 9, 10, 11, 12, 13, 14, 15);
329 zero
= _mm_setzero_si128();
330 one
= _mm_set_epi64x(0, 1);
333 * Decode the counter in big-endian.
335 ivx
= _mm_shuffle_epi8(_mm_loadu_si128(ctr
), erev
);
336 cmx
= _mm_loadu_si128(cbcmac
);
344 * Load initial values:
345 * dx encrypted block of data
346 * x0 counter (for CTR encryption)
347 * x1 input for CBC-MAC
349 dx
= _mm_loadu_si128((void *)buf
);
350 x0
= _mm_shuffle_epi8(ivx
, erev
);
353 x0
= _mm_xor_si128(x0
, sk
[0]);
354 x1
= _mm_xor_si128(x1
, sk
[0]);
355 x0
= _mm_aesenc_si128(x0
, sk
[1]);
356 x1
= _mm_aesenc_si128(x1
, sk
[1]);
357 x0
= _mm_aesenc_si128(x0
, sk
[2]);
358 x1
= _mm_aesenc_si128(x1
, sk
[2]);
359 x0
= _mm_aesenc_si128(x0
, sk
[3]);
360 x1
= _mm_aesenc_si128(x1
, sk
[3]);
361 x0
= _mm_aesenc_si128(x0
, sk
[4]);
362 x1
= _mm_aesenc_si128(x1
, sk
[4]);
363 x0
= _mm_aesenc_si128(x0
, sk
[5]);
364 x1
= _mm_aesenc_si128(x1
, sk
[5]);
365 x0
= _mm_aesenc_si128(x0
, sk
[6]);
366 x1
= _mm_aesenc_si128(x1
, sk
[6]);
367 x0
= _mm_aesenc_si128(x0
, sk
[7]);
368 x1
= _mm_aesenc_si128(x1
, sk
[7]);
369 x0
= _mm_aesenc_si128(x0
, sk
[8]);
370 x1
= _mm_aesenc_si128(x1
, sk
[8]);
371 x0
= _mm_aesenc_si128(x0
, sk
[9]);
372 x1
= _mm_aesenc_si128(x1
, sk
[9]);
373 if (num_rounds
== 10) {
374 x0
= _mm_aesenclast_si128(x0
, sk
[10]);
375 x1
= _mm_aesenclast_si128(x1
, sk
[10]);
376 } else if (num_rounds
== 12) {
377 x0
= _mm_aesenc_si128(x0
, sk
[10]);
378 x1
= _mm_aesenc_si128(x1
, sk
[10]);
379 x0
= _mm_aesenc_si128(x0
, sk
[11]);
380 x1
= _mm_aesenc_si128(x1
, sk
[11]);
381 x0
= _mm_aesenclast_si128(x0
, sk
[12]);
382 x1
= _mm_aesenclast_si128(x1
, sk
[12]);
384 x0
= _mm_aesenc_si128(x0
, sk
[10]);
385 x1
= _mm_aesenc_si128(x1
, sk
[10]);
386 x0
= _mm_aesenc_si128(x0
, sk
[11]);
387 x1
= _mm_aesenc_si128(x1
, sk
[11]);
388 x0
= _mm_aesenc_si128(x0
, sk
[12]);
389 x1
= _mm_aesenc_si128(x1
, sk
[12]);
390 x0
= _mm_aesenc_si128(x0
, sk
[13]);
391 x1
= _mm_aesenc_si128(x1
, sk
[13]);
392 x0
= _mm_aesenclast_si128(x0
, sk
[14]);
393 x1
= _mm_aesenclast_si128(x1
, sk
[14]);
396 x0
= _mm_xor_si128(x0
, dx
);
398 cmx
= _mm_xor_si128(cmx
, x0
);
401 cmx
= _mm_xor_si128(x1
, x0
);
403 _mm_storeu_si128((void *)buf
, x0
);
409 * Increment the counter value.
411 ivx
= _mm_add_epi64(ivx
, one
);
412 ivx
= _mm_sub_epi64(ivx
,
413 _mm_slli_si128(_mm_cmpeq_epi64(ivx
, zero
), 8));
416 * If this was the last iteration, then compute the
417 * extra block encryption to complete CBC-MAC.
420 cmx
= _mm_xor_si128(cmx
, sk
[0]);
421 cmx
= _mm_aesenc_si128(cmx
, sk
[1]);
422 cmx
= _mm_aesenc_si128(cmx
, sk
[2]);
423 cmx
= _mm_aesenc_si128(cmx
, sk
[3]);
424 cmx
= _mm_aesenc_si128(cmx
, sk
[4]);
425 cmx
= _mm_aesenc_si128(cmx
, sk
[5]);
426 cmx
= _mm_aesenc_si128(cmx
, sk
[6]);
427 cmx
= _mm_aesenc_si128(cmx
, sk
[7]);
428 cmx
= _mm_aesenc_si128(cmx
, sk
[8]);
429 cmx
= _mm_aesenc_si128(cmx
, sk
[9]);
430 if (num_rounds
== 10) {
431 cmx
= _mm_aesenclast_si128(cmx
, sk
[10]);
432 } else if (num_rounds
== 12) {
433 cmx
= _mm_aesenc_si128(cmx
, sk
[10]);
434 cmx
= _mm_aesenc_si128(cmx
, sk
[11]);
435 cmx
= _mm_aesenclast_si128(cmx
, sk
[12]);
437 cmx
= _mm_aesenc_si128(cmx
, sk
[10]);
438 cmx
= _mm_aesenc_si128(cmx
, sk
[11]);
439 cmx
= _mm_aesenc_si128(cmx
, sk
[12]);
440 cmx
= _mm_aesenc_si128(cmx
, sk
[13]);
441 cmx
= _mm_aesenclast_si128(cmx
, sk
[14]);
448 * Write back new counter value and CBC-MAC value.
450 _mm_storeu_si128(ctr
, _mm_shuffle_epi8(ivx
, erev
));
451 _mm_storeu_si128(cbcmac
, cmx
);
454 /* see bearssl_block.h */
455 BR_TARGET("sse2,sse4.1,aes")
457 br_aes_x86ni_ctrcbc_decrypt(const br_aes_x86ni_ctrcbc_keys
*ctx
,
458 void *ctr
, void *cbcmac
, void *data
, size_t len
)
464 __m128i erev
, zero
, one
;
467 num_rounds
= ctx
->num_rounds
;
468 for (u
= 0; u
<= num_rounds
; u
++) {
469 sk
[u
] = _mm_loadu_si128((void *)(ctx
->skey
.skni
+ (u
<< 4)));
473 * Some SSE2 constants.
475 erev
= _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
476 8, 9, 10, 11, 12, 13, 14, 15);
477 zero
= _mm_setzero_si128();
478 one
= _mm_set_epi64x(0, 1);
481 * Decode the counter in big-endian.
483 ivx
= _mm_shuffle_epi8(_mm_loadu_si128(ctr
), erev
);
484 cmx
= _mm_loadu_si128(cbcmac
);
491 * Load initial values:
492 * dx encrypted block of data
493 * x0 counter (for CTR encryption)
494 * x1 input for CBC-MAC
496 dx
= _mm_loadu_si128((void *)buf
);
497 x0
= _mm_shuffle_epi8(ivx
, erev
);
498 x1
= _mm_xor_si128(cmx
, dx
);
500 x0
= _mm_xor_si128(x0
, sk
[0]);
501 x1
= _mm_xor_si128(x1
, sk
[0]);
502 x0
= _mm_aesenc_si128(x0
, sk
[1]);
503 x1
= _mm_aesenc_si128(x1
, sk
[1]);
504 x0
= _mm_aesenc_si128(x0
, sk
[2]);
505 x1
= _mm_aesenc_si128(x1
, sk
[2]);
506 x0
= _mm_aesenc_si128(x0
, sk
[3]);
507 x1
= _mm_aesenc_si128(x1
, sk
[3]);
508 x0
= _mm_aesenc_si128(x0
, sk
[4]);
509 x1
= _mm_aesenc_si128(x1
, sk
[4]);
510 x0
= _mm_aesenc_si128(x0
, sk
[5]);
511 x1
= _mm_aesenc_si128(x1
, sk
[5]);
512 x0
= _mm_aesenc_si128(x0
, sk
[6]);
513 x1
= _mm_aesenc_si128(x1
, sk
[6]);
514 x0
= _mm_aesenc_si128(x0
, sk
[7]);
515 x1
= _mm_aesenc_si128(x1
, sk
[7]);
516 x0
= _mm_aesenc_si128(x0
, sk
[8]);
517 x1
= _mm_aesenc_si128(x1
, sk
[8]);
518 x0
= _mm_aesenc_si128(x0
, sk
[9]);
519 x1
= _mm_aesenc_si128(x1
, sk
[9]);
520 if (num_rounds
== 10) {
521 x0
= _mm_aesenclast_si128(x0
, sk
[10]);
522 x1
= _mm_aesenclast_si128(x1
, sk
[10]);
523 } else if (num_rounds
== 12) {
524 x0
= _mm_aesenc_si128(x0
, sk
[10]);
525 x1
= _mm_aesenc_si128(x1
, sk
[10]);
526 x0
= _mm_aesenc_si128(x0
, sk
[11]);
527 x1
= _mm_aesenc_si128(x1
, sk
[11]);
528 x0
= _mm_aesenclast_si128(x0
, sk
[12]);
529 x1
= _mm_aesenclast_si128(x1
, sk
[12]);
531 x0
= _mm_aesenc_si128(x0
, sk
[10]);
532 x1
= _mm_aesenc_si128(x1
, sk
[10]);
533 x0
= _mm_aesenc_si128(x0
, sk
[11]);
534 x1
= _mm_aesenc_si128(x1
, sk
[11]);
535 x0
= _mm_aesenc_si128(x0
, sk
[12]);
536 x1
= _mm_aesenc_si128(x1
, sk
[12]);
537 x0
= _mm_aesenc_si128(x0
, sk
[13]);
538 x1
= _mm_aesenc_si128(x1
, sk
[13]);
539 x0
= _mm_aesenclast_si128(x0
, sk
[14]);
540 x1
= _mm_aesenclast_si128(x1
, sk
[14]);
542 x0
= _mm_xor_si128(x0
, dx
);
544 _mm_storeu_si128((void *)buf
, x0
);
550 * Increment the counter value.
552 ivx
= _mm_add_epi64(ivx
, one
);
553 ivx
= _mm_sub_epi64(ivx
,
554 _mm_slli_si128(_mm_cmpeq_epi64(ivx
, zero
), 8));
558 * Write back new counter value and CBC-MAC value.
560 _mm_storeu_si128(ctr
, _mm_shuffle_epi8(ivx
, erev
));
561 _mm_storeu_si128(cbcmac
, cmx
);
566 /* see bearssl_block.h */
567 const br_block_ctrcbc_class br_aes_x86ni_ctrcbc_vtable
= {
568 sizeof(br_aes_x86ni_ctrcbc_keys
),
571 (void (*)(const br_block_ctrcbc_class
**, const void *, size_t))
572 &br_aes_x86ni_ctrcbc_init
,
573 (void (*)(const br_block_ctrcbc_class
*const *,
574 void *, void *, void *, size_t))
575 &br_aes_x86ni_ctrcbc_encrypt
,
576 (void (*)(const br_block_ctrcbc_class
*const *,
577 void *, void *, void *, size_t))
578 &br_aes_x86ni_ctrcbc_decrypt
,
579 (void (*)(const br_block_ctrcbc_class
*const *,
580 void *, void *, size_t))
581 &br_aes_x86ni_ctrcbc_ctr
,
582 (void (*)(const br_block_ctrcbc_class
*const *,
583 void *, const void *, size_t))
584 &br_aes_x86ni_ctrcbc_mac
589 /* see bearssl_block.h */
590 const br_block_ctrcbc_class
*
591 br_aes_x86ni_ctrcbc_get_vtable(void)