_ Git - BearSSL/blob - src/symcipher/aes_pwr8.c

   1 /*
   2  * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining
   5  * a copy of this software and associated documentation files (the
   6  * "Software"), to deal in the Software without restriction, including
   7  * without limitation the rights to use, copy, modify, merge, publish,
   8  * distribute, sublicense, and/or sell copies of the Software, and to
   9  * permit persons to whom the Software is furnished to do so, subject to
  10  * the following conditions:
  11  *
  12  * The above copyright notice and this permission notice shall be
  13  * included in all copies or substantial portions of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  16  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  17  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  18  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  19  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  20  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  21  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24
  25 #define BR_POWER_ASM_MACROS   1
  26 #include "inner.h"
  27
  28 /*
  29  * This code contains the AES key schedule implementation using the
  30  * POWER8 opcodes.
  31  */
  32
  33 #if BR_POWER8
  34
  35 static void
  36 key_schedule_128(unsigned char *sk, const unsigned char *key)
  37 {
             /*
              * AES-128 key schedule. Reads the 16-byte key at 'key' and
              * writes 11 subkeys of 16 bytes each (176 bytes) at 'sk': the
              * key itself, then one more subkey per iteration of the
              * 10-round loop below. On little-endian builds every subkey
              * goes through the v8 permutation before being stored.
              */
  38         long cc;
  39
             /* Value XORed into Rcon when doubling it overflows 8 bits. */
  40         static const uint32_t fmod[] = { 0x11B, 0x11B, 0x11B, 0x11B };
  41 #if BR_POWER8_LE
             /* Permutation indices loaded into v8 (word byteswap constant). */
  42         static const uint32_t idx2be[] = {
  43                 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
  44         };
  45 #endif
  46
             /*
              * cc is a read-write ("+b") asm operand used only as scratch
              * for the loop count; give it a defined value on entry.
              */
  47         cc = 0;
  48
  49         /*
  50          * We use the VSX instructions for loading and storing the
  51          * key/subkeys, since they support unaligned accesses. The rest
  52          * of the computation is VMX only. VMX register 0 is VSX
  53          * register 32.
  54          */
  55         asm volatile (
  56
  57                 /*
  58                  * v0 = all-zero word
  59                  * v1 = constant -8 / +8, copied into four words
  60                  * v2 = current subkey
  61                  * v3 = Rcon (x4 words)
  62                  * v6 = constant 8, copied into four words
  63                  * v7 = constant 0x11B, copied into four words
  64                  * v8 = constant for byteswapping words
  65                  */
  66                 vspltisw(0, 0)
  67 #if BR_POWER8_LE
  68                 vspltisw(1, -8)
  69 #else
  70                 vspltisw(1, 8)
  71 #endif
  72                 lxvw4x(34, 0, %[key])
  73                 vspltisw(3, 1)
  74                 vspltisw(6, 8)
  75                 lxvw4x(39, 0, %[fmod])
  76 #if BR_POWER8_LE
  77                 lxvw4x(40, 0, %[idx2be])
  78 #endif
  79
  80                 /*
  81                  * First subkey is a copy of the key itself.
  82                  */
  83 #if BR_POWER8_LE
  84                 vperm(4, 2, 2, 8)
  85                 stxvw4x(36, 0, %[sk])
  86 #else
  87                 stxvw4x(34, 0, %[sk])
  88 #endif
  89
  90                 /*
  91                  * Loop must run 10 times.
  92                  */
  93                 li(%[cc], 10)
  94                 mtctr(%[cc])
  95         label(loop)
  96                 /* Increment subkey address */
  97                 addi(%[sk], %[sk], 16)
  98
  99                 /* Compute SubWord(RotWord(temp)) xor Rcon  (into v4, splat) */
                     /*
                      * Each word of v2 is rotated by the amount held in v1,
                      * run through vsbox and XORed with Rcon (shifted into
                      * place first on big-endian); word 3 of the result is
                      * then copied into all four words of v4.
                      */
 100                 vrlw(4, 2, 1)
 101                 vsbox(4, 4)
 102 #if BR_POWER8_LE
 103                 vxor(4, 4, 3)
 104 #else
 105                 vsldoi(5, 3, 0, 3)
 106                 vxor(4, 4, 5)
 107 #endif
 108                 vspltw(4, 4, 3)
 109
 110                 /* XOR words for next subkey */
                     /*
                      * Three "shift down by one word, then XOR" steps leave
                      * each word of v2 equal to the XOR of itself and all
                      * lower-indexed words; the splatted v4 is then XORed
                      * into every word.
                      */
 111                 vsldoi(5, 0, 2, 12)
 112                 vxor(2, 2, 5)
 113                 vsldoi(5, 0, 2, 12)
 114                 vxor(2, 2, 5)
 115                 vsldoi(5, 0, 2, 12)
 116                 vxor(2, 2, 5)
 117                 vxor(2, 2, 4)
 118
 119                 /* Store next subkey */
 120 #if BR_POWER8_LE
 121                 vperm(4, 2, 2, 8)
 122                 stxvw4x(36, 0, %[sk])
 123 #else
 124                 stxvw4x(34, 0, %[sk])
 125 #endif
 126
 127                 /* Update Rcon */
                     /*
                      * Rcon is doubled; (Rcon >> 8) is non-zero only when the
                      * doubling overflowed 8 bits, and 0 minus that value is
                      * then an all-ones mask that selects 0x11B (v7) to be
                      * XORed back in.
                      */
 128                 vadduwm(3, 3, 3)
 129                 vsrw(4, 3, 6)
 130                 vsubuwm(4, 0, 4)
 131                 vand(4, 4, 7)
 132                 vxor(3, 3, 4)
 133
 134                 bdnz(loop)
 135
 136 : [sk] "+b" (sk), [cc] "+b" (cc)
 137 : [key] "b" (key), [fmod] "b" (fmod)
 138 #if BR_POWER8_LE
 139         , [idx2be] "b" (idx2be)
 140 #endif
     /*
      * v8 is overwritten by the idx2be load on little-endian builds, so it
      * must be declared as clobbered along with v0..v7.
      */
 141 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "ctr", "memory"
 142         );
 143 }
 144
 145 static void
 146 key_schedule_192(unsigned char *sk, const unsigned char *key)
 147 {
             /*
              * AES-192 key schedule. Reads the 24-byte key at 'key' (two
              * 16-byte loads, at offsets 0 and 8) and writes 52 subkey
              * words (13 subkeys, 208 bytes) at 'sk'. On little-endian
              * builds every stored vector goes through the v8 permutation
              * first.
              */
 148         long cc;
 149
 150 #if BR_POWER8_LE
             /* Permutation indices loaded into v8 (word byteswap constant). */
 151         static const uint32_t idx2be[] = {
 152                 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
 153         };
 154 #endif
 155
             /*
              * cc is a read-write ("+b") asm operand; inside the asm it is
              * reused as load offset (8), loop count (8) and store offset
              * (16). Give it a defined value on entry.
              */
 156         cc = 0;
 157
 158         /*
 159          * We use the VSX instructions for loading and storing the
 160          * key/subkeys, since they support unaligned accesses. The rest
 161          * of the computation is VMX only. VMX register 0 is VSX
 162          * register 32.
 163          */
 164         asm volatile (
 165
 166                 /*
 167                  * v0 = all-zero word
 168                  * v1 = constant -8 / +8, copied into four words
 169                  * v2, v3 = current subkey
 170                  * v5 = Rcon (x4 words) (already shifted on big-endian)
 171                  * v6 = constant 8, copied into four words (set, but not read below)
 172                  * v8 = constant for byteswapping words
 173                  *
 174                  * Only the left two words of v3 hold key material on loop entry (the loop calls them v3l); its right two words are ignored.
 175                  */
 176                 vspltisw(0, 0)
 177 #if BR_POWER8_LE
 178                 vspltisw(1, -8)
 179 #else
 180                 vspltisw(1, 8)
 181 #endif
                     /*
                      * Load key bytes 0..15 into v2 and bytes 8..23 into v3,
                      * then shift v3 so that bytes 16..23 end up in its left
                      * half, followed by zeros.
                      */
 182                 li(%[cc], 8)
 183                 lxvw4x(34, 0, %[key])
 184                 lxvw4x(35, %[cc], %[key])
 185                 vsldoi(3, 3, 0, 8)
 186                 vspltisw(5, 1)
 187 #if !BR_POWER8_LE
 188                 vsldoi(5, 5, 0, 3)
 189 #endif
 190                 vspltisw(6, 8)
 191 #if BR_POWER8_LE
 192                 lxvw4x(40, 0, %[idx2be])
 193 #endif
 194
 195                 /*
 196                  * Loop must run 8 times. Each iteration produces 256
 197                  * bits of subkeys, with a 64-bit overlap.
 198                  */
 199                 li(%[cc], 8)
 200                 mtctr(%[cc])
                     /* From here on cc is the constant offset 16 for the second store. */
 201                 li(%[cc], 16)
 202         label(loop)
 203
 204                 /*
 205                  * Last 6 words in v2:v3l. Compute next 6 words into
 206                  * v3r:v4.
 207                  */
                     /*
                      * v10 = word 1 of (vsbox(rotate(v3)) xor Rcon), copied
                      * into all four words; v11 = the same value in the right
                      * two words only.
                      */
 208                 vrlw(10, 3, 1)
 209                 vsbox(10, 10)
 210                 vxor(10, 10, 5)
 211                 vspltw(10, 10, 1)
 212                 vsldoi(11, 0, 10, 8)
 213
                     /*
                      * v12 = running XOR of the words of v2 (each word XORed
                      * with all lower-indexed words).
                      */
 214                 vsldoi(12, 0, 2, 12)
 215                 vxor(12, 2, 12)
 216                 vsldoi(13, 0, 12, 12)
 217                 vxor(12, 12, 13)
 218                 vsldoi(13, 0, 12, 12)
 219                 vxor(12, 12, 13)
 220
                     /*
                      * v13 continues the running XOR across the two live
                      * words of v3, starting from the last word of v12.
                      */
 221                 vspltw(13, 12, 3)
 222                 vxor(13, 13, 3)
 223                 vsldoi(14, 0, 3, 12)
 224                 vxor(13, 13, 14)
 225
                     /*
                      * Reassemble: v3 = old v3l followed by the first two
                      * words of v12; v4 = last two words of v12 followed by
                      * the first two words of v13. The v10/v11 term is then
                      * XORed into the six new words.
                      */
 226                 vsldoi(4, 12, 13, 8)
 227                 vsldoi(14, 0, 3, 8)
 228                 vsldoi(3, 14, 12, 8)
 229
 230                 vxor(3, 3, 11)
 231                 vxor(4, 4, 10)
 232
 233                 /*
 234                  * Update Rcon. Since for a 192-bit key, we use only 8
 235                  * such constants, we will not hit the field modulus,
 236                  * so a simple shift (addition) works well.
 237                  */
 238                 vadduwm(5, 5, 5)
 239
 240                 /*
 241                  * Write out the two left 128-bit words
 242                  */
 243 #if BR_POWER8_LE
 244                 vperm(10, 2, 2, 8)
 245                 vperm(11, 3, 3, 8)
 246                 stxvw4x(42, 0, %[sk])
 247                 stxvw4x(43, %[cc], %[sk])
 248 #else
 249                 stxvw4x(34, 0, %[sk])
 250                 stxvw4x(35, %[cc], %[sk])
 251 #endif
                     /*
                      * 32 bytes were stored but the pointer advances by only
                      * 24: the last 8 bytes are rewritten by the next store.
                      */
 252                 addi(%[sk], %[sk], 24)
 253
 254                 /*
 255                  * Shift words for next iteration.
 256                  */
 257                 vsldoi(2, 3, 4, 8)
 258                 vsldoi(3, 4, 0, 8)
 259
 260                 bdnz(loop)
 261
 262                 /*
 263                  * The loop wrote the first 50 subkey words, but we need
 264                  * to produce 52, so we must do one last write.
 265                  */
 266 #if BR_POWER8_LE
 267                 vperm(10, 2, 2, 8)
 268                 stxvw4x(42, 0, %[sk])
 269 #else
 270                 stxvw4x(34, 0, %[sk])
 271 #endif
 272
 273 : [sk] "+b" (sk), [cc] "+b" (cc)
 274 : [key] "b" (key)
 275 #if BR_POWER8_LE
 276         , [idx2be] "b" (idx2be)
 277 #endif
 278 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
 279   "v8", "v9", "v10", "v11", "v12", "v13", "v14", "ctr", "memory"
 280         );
 281 }
 282
 283 static void
 284 key_schedule_256(unsigned char *sk, const unsigned char *key)
 285 {
             /*
              * AES-256 key schedule. Reads the 32-byte key at 'key' (two
              * 16-byte loads, at offsets 0 and 16) and writes 15 subkeys of
              * 16 bytes each (240 bytes) at 'sk'. On little-endian builds
              * every stored vector goes through the v8 permutation first.
              */
 286         long cc;
 287
 288 #if BR_POWER8_LE
             /* Permutation indices loaded into v8 (word byteswap constant). */
 289         static const uint32_t idx2be[] = {
 290                 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
 291         };
 292 #endif
 293
             /*
              * cc is a read-write ("+b") asm operand; inside the asm it is
              * reused as load offset (16), loop count (7) and store offset
              * (16). Give it a defined value on entry.
              */
 294         cc = 0;
 295
 296         /*
 297          * We use the VSX instructions for loading and storing the
 298          * key/subkeys, since they support unaligned accesses. The rest
 299          * of the computation is VMX only. VMX register 0 is VSX
 300          * register 32.
 301          */
 302         asm volatile (
 303
 304                 /*
 305                  * v0 = all-zero word
 306                  * v1 = constant -8 / +8, copied into four words
 307                  * v2, v3 = current subkey
 308                  * v6 = Rcon (x4 words) (already shifted on big-endian)
 309                  * v7 = constant 8, copied into four words (set, but not read below)
 310                  * v8 = constant for byteswapping words
 311                  *
 312                  * All four words of v3 are live here: v3 is stored as a subkey and fed whole into the computation of v5.
 313                  */
 314                 vspltisw(0, 0)
 315 #if BR_POWER8_LE
 316                 vspltisw(1, -8)
 317 #else
 318                 vspltisw(1, 8)
 319 #endif
                     /* Load key bytes 0..15 into v2 and bytes 16..31 into v3. */
 320                 li(%[cc], 16)
 321                 lxvw4x(34, 0, %[key])
 322                 lxvw4x(35, %[cc], %[key])
 323                 vspltisw(6, 1)
 324 #if !BR_POWER8_LE
 325                 vsldoi(6, 6, 0, 3)
 326 #endif
 327                 vspltisw(7, 8)
 328 #if BR_POWER8_LE
 329                 lxvw4x(40, 0, %[idx2be])
 330 #endif
 331
 332                 /*
 333                  * Loop must run 7 times. Each iteration produces two
 334                  * subkeys.
 335                  */
 336                 li(%[cc], 7)
 337                 mtctr(%[cc])
                     /* From here on cc is the constant offset 16 for the second store. */
 338                 li(%[cc], 16)
 339         label(loop)
 340
 341                 /*
 342                  * Current words are in v2:v3. Compute next word in v4.
 343                  */
                     /*
                      * v10 = word 3 of (vsbox(rotate(v3)) xor Rcon), copied
                      * into all four words.
                      */
 344                 vrlw(10, 3, 1)
 345                 vsbox(10, 10)
 346                 vxor(10, 10, 6)
 347                 vspltw(10, 10, 3)
 348
                     /*
                      * v4 = running XOR of the words of v2 (each word XORed
                      * with all lower-indexed words), then XORed with v10.
                      */
 349                 vsldoi(4, 0, 2, 12)
 350                 vxor(4, 2, 4)
 351                 vsldoi(5, 0, 4, 12)
 352                 vxor(4, 4, 5)
 353                 vsldoi(5, 0, 4, 12)
 354                 vxor(4, 4, 5)
 355                 vxor(4, 4, 10)
 356
 357                 /*
 358                  * Then other word in v5.
 359                  */
                     /*
                      * This half applies vsbox to v4 directly (no rotation,
                      * no Rcon) and broadcasts word 3 of the result; v5 is
                      * then the running XOR of the words of v3, XORed with
                      * that value.
                      */
 360                 vsbox(10, 4)
 361                 vspltw(10, 10, 3)
 362
 363                 vsldoi(5, 0, 3, 12)
 364                 vxor(5, 3, 5)
 365                 vsldoi(11, 0, 5, 12)
 366                 vxor(5, 5, 11)
 367                 vsldoi(11, 0, 5, 12)
 368                 vxor(5, 5, 11)
 369                 vxor(5, 5, 10)
 370
 371                 /*
 372                  * Update Rcon. Since for a 256-bit key, we use only 7
 373                  * such constants, we will not hit the field modulus,
 374                  * so a simple shift (addition) works well.
 375                  */
 376                 vadduwm(6, 6, 6)
 377
 378                 /*
 379                  * Write out the two left 128-bit words
 380                  */
 381 #if BR_POWER8_LE
 382                 vperm(10, 2, 2, 8)
 383                 vperm(11, 3, 3, 8)
 384                 stxvw4x(42, 0, %[sk])
 385                 stxvw4x(43, %[cc], %[sk])
 386 #else
 387                 stxvw4x(34, 0, %[sk])
 388                 stxvw4x(35, %[cc], %[sk])
 389 #endif
 390                 addi(%[sk], %[sk], 32)
 391
 392                 /*
 393                  * Replace v2:v3 with v4:v5.
 394                  */
                     /* XOR with the all-zero v0 acts as a register move. */
 395                 vxor(2, 0, 4)
 396                 vxor(3, 0, 5)
 397
 398                 bdnz(loop)
 399
 400                 /*
 401                  * The loop wrote the first 14 subkeys, but we need 15,
 402                  * so we must do an extra write.
 403                  */
 404 #if BR_POWER8_LE
 405                 vperm(10, 2, 2, 8)
 406                 stxvw4x(42, 0, %[sk])
 407 #else
 408                 stxvw4x(34, 0, %[sk])
 409 #endif
 410
 411 : [sk] "+b" (sk), [cc] "+b" (cc)
 412 : [key] "b" (key)
 413 #if BR_POWER8_LE
 414         , [idx2be] "b" (idx2be)
 415 #endif
 416 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
 417   "v8", "v9", "v10", "v11", "v12", "v13", "v14", "ctr", "memory"
 418         );
 419 }
 420
 421 /* see inner.h */
     /*
      * Always returns 1. This definition sits inside the "#if BR_POWER8"
      * region, so it is only compiled when BR_POWER8 is non-zero.
      */
 422 int
 423 br_aes_pwr8_supported(void)
 424 {
 425         return 1;
 426 }
 427
 428 /* see inner.h */
     /*
      * Select the key schedule from the key length in bytes and run it,
      * writing the subkeys to 'sk'.
      *
      *   len == 16: key_schedule_128(), returns 10 (11 subkeys, 176 bytes)
      *   len == 24: key_schedule_192(), returns 12 (13 subkeys, 208 bytes)
      *   otherwise: key_schedule_256(), returns 14 (15 subkeys, 240 bytes)
      *
      * The returned value is one less than the number of 16-byte subkeys
      * written.
      *
      * NOTE(review): 'len' is not validated here. Any value other than 16
      * or 24 takes the 256-bit path, which reads 32 bytes from 'key';
      * presumably callers only pass 16, 24 or 32 -- confirm at call sites.
      */
 429 unsigned
 430 br_aes_pwr8_keysched(unsigned char *sk, const void *key, size_t len)
 431 {
 432         switch (len) {
 433         case 16:
 434                 key_schedule_128(sk, key);
 435                 return 10;
 436         case 24:
 437                 key_schedule_192(sk, key);
 438                 return 12;
 439         default:
 440                 key_schedule_256(sk, key);
 441                 return 14;
 442         }
 443 }
 444
 445 #endif