2 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
4 * Permission is hereby granted, free of charge, to any person obtaining
5 * a copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25 #define BR_POWER_ASM_MACROS 1
29 * This code contains the AES key schedule implementation using the
30 * POWER8 opcodes.
36 key_schedule_128(unsigned char *sk
, const unsigned char *key
)
40 static const uint32_t fmod
[] = { 0x11B, 0x11B, 0x11B, 0x11B };
42 static const uint32_t idx2be
[] = {
43 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
50 * We use the VSX instructions for loading and storing the
51 * key/subkeys, since they support unaligned accesses. The rest
52 * of the computation is VMX only. VMX register 0 is VSX
53 * register 32.
59 * v1 = constant -8 / +8, copied into four words
61 * v3 = Rcon (x4 words)
62 * v6 = constant 8, copied into four words
63 * v7 = constant 0x11B, copied into four words
64 * v8 = constant for byteswapping words
75 lxvw4x(39, 0, %[fmod
])
77 lxvw4x(40, 0, %[idx2be
])
81 * First subkey is a copy of the key itself.
91 * Loop must run 10 times.
96 /* Increment subkey address */
97 addi(%[sk
], %[sk
], 16)
99 /* Compute SubWord(RotWord(temp)) xor Rcon (into v4, splat) */
110 /* XOR words for next subkey */
119 /* Store next subkey */
122 stxvw4x(36, 0, %[sk
])
124 stxvw4x(34, 0, %[sk
])
136 : [sk
] "+b" (sk
), [cc
] "+b" (cc
)
137 : [key
] "b" (key
), [fmod
] "b" (fmod
)
139 , [idx2be
] "b" (idx2be
)
141 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "ctr", "memory"
146 key_schedule_192(unsigned char *sk
, const unsigned char *key
)
151 static const uint32_t idx2be
[] = {
152 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
159 * We use the VSX instructions for loading and storing the
160 * key/subkeys, since they support unaligned accesses. The rest
161 * of the computation is VMX only. VMX register 0 is VSX
162 * register 32.
168 * v1 = constant -8 / +8, copied into four words
169 * v2, v3 = current subkey
170 * v5 = Rcon (x4 words) (already shifted on big-endian)
171 * v6 = constant 8, copied into four words
172 * v8 = constant for byteswapping words
174 * The left two words of v3 are ignored.
183 lxvw4x(34, 0, %[key
])
184 lxvw4x(35, %[cc
], %[key
])
192 lxvw4x(40, 0, %[idx2be
])
196 * Loop must run 8 times. Each iteration produces 256
197 * bits of subkeys, with a 64-bit overlap.
205 * Last 6 words in v2:v3l. Compute next 6 words into
216 vsldoi(13, 0, 12, 12)
218 vsldoi(13, 0, 12, 12)
234 * Update Rcon. Since for a 192-bit key, we use only 8
235 * such constants, we will not hit the field modulus,
236 * so a simple shift (addition) works well.
241 * Write out the two left 128-bit words
246 stxvw4x(42, 0, %[sk
])
247 stxvw4x(43, %[cc
], %[sk
])
249 stxvw4x(34, 0, %[sk
])
250 stxvw4x(35, %[cc
], %[sk
])
252 addi(%[sk
], %[sk
], 24)
255 * Shift words for next iteration.
263 * The loop wrote the first 50 subkey words, but we need
264 * to produce 52, so we must do one last write.
268 stxvw4x(42, 0, %[sk
])
270 stxvw4x(34, 0, %[sk
])
273 : [sk
] "+b" (sk
), [cc
] "+b" (cc
)
276 , [idx2be
] "b" (idx2be
)
278 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
279 "v8", "v9", "v10", "v11", "v12", "v13", "v14", "ctr", "memory"
284 key_schedule_256(unsigned char *sk
, const unsigned char *key
)
289 static const uint32_t idx2be
[] = {
290 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
297 * We use the VSX instructions for loading and storing the
298 * key/subkeys, since they support unaligned accesses. The rest
299 * of the computation is VMX only. VMX register 0 is VSX
300 * register 32.
306 * v1 = constant -8 / +8, copied into four words
307 * v2, v3 = current subkey
308 * v6 = Rcon (x4 words) (already shifted on big-endian)
309 * v7 = constant 8, copied into four words
310 * v8 = constant for byteswapping words
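312 * The left two words of v3 are ignored. NOTE(review): this sentence appears to be carried over from the 192-bit schedule; here v2:v3 hold the full 256-bit key and both are consumed each iteration -- confirm and drop if stale.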
321 lxvw4x(34, 0, %[key
])
322 lxvw4x(35, %[cc
], %[key
])
329 lxvw4x(40, 0, %[idx2be
])
333 * Loop must run 7 times. Each iteration produces two
334 * subkeys.
342 * Current words are in v2:v3. Compute next word in v4.
358 * Then other word in v5.
372 * Update Rcon. Since for a 256-bit key, we use only 7
373 * such constants, we will not hit the field modulus,
374 * so a simple shift (addition) works well.
379 * Write out the two left 128-bit words
384 stxvw4x(42, 0, %[sk
])
385 stxvw4x(43, %[cc
], %[sk
])
387 stxvw4x(34, 0, %[sk
])
388 stxvw4x(35, %[cc
], %[sk
])
390 addi(%[sk
], %[sk
], 32)
393 * Replace v2:v3 with v4:v5.
401 * The loop wrote the first 14 subkeys, but we need 15,
402 * so we must do an extra write.
406 stxvw4x(42, 0, %[sk
])
408 stxvw4x(34, 0, %[sk
])
411 : [sk
] "+b" (sk
), [cc
] "+b" (cc
)
414 , [idx2be
] "b" (idx2be
)
416 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
417 "v8", "v9", "v10", "v11", "v12", "v13", "v14", "ctr", "memory"
423 br_aes_pwr8_supported(void)
430 br_aes_pwr8_keysched(unsigned char *sk
, const void *key
, size_t len
)
434 key_schedule_128(sk
, key
);
437 key_schedule_192(sk
, key
);
440 key_schedule_256(sk
, key
);