From: Thomas Pornin <pornin@bolet.org>
Date: Wed, 15 Feb 2017 14:08:37 +0000 (+0000)
Subject: New AES and GHASH implementations using POWER8 crypto opcodes.
X-Git-Tag: v0.4~9
X-Git-Url: https://www.bearssl.org/gitweb//home/git/?p=BearSSL;a=commitdiff_plain;h=db8f1b664524e3fbeea8a0730b2bbe2f0bdcea86

New AES and GHASH implementations using POWER8 crypto opcodes.
---

diff --git a/inc/bearssl_block.h b/inc/bearssl_block.h
index c6f20f5..88f51b2 100644
--- a/inc/bearssl_block.h
+++ b/inc/bearssl_block.h
@@ -192,6 +192,7 @@
  * | aes_ct    | AES      |        16          | 16, 24 and 32       |
  * | aes_ct64  | AES      |        16          | 16, 24 and 32       |
  * | aes_x86ni | AES      |        16          | 16, 24 and 32       |
+ * | aes_pwr8  | AES      |        16          | 16, 24 and 32       |
  * | des_ct    | DES/3DES |         8          | 8, 16 and 24        |
  * | des_tab   | DES/3DES |         8          | 8, 16 and 24        |
  *
@@ -225,9 +226,11 @@
  * operations (i.e. CTR, and CBC decryption, but not CBC encryption).
  *
  * `aes_x86ni` exists only on x86 architectures (32-bit and 64-bit). It
- * uses the AES-NI opcodes when available; if the opcodes are not present,
- * then it automatically fall backs on an appropriate constant-time
- * implementation (`aes_ct` for 32-bit, `aes_ct64` for 64-bit).
+ * uses the AES-NI opcodes when available.
+ *
+ * `aes_pwr8` exists only on PowerPC / POWER architectures (32-bit and
+ * 64-bit, both little-endian and big-endian). It uses the AES opcodes
+ * present in POWER8 and later.
  *
  * `des_tab` is a classic, table-based implementation of DES/3DES. It
  * is not constant-time.
@@ -1001,9 +1004,7 @@ uint32_t br_aes_ct64_ctr_run(const br_aes_ct64_ctr_keys *ctx,
 	const void *iv, uint32_t cc, void *data, size_t len);
 
 /*
- * AES implementation using AES-NI opcode (x86 platform). When the
- * opcodes are not present, this falls back to "ct" or "ct64" (depending
- * on architecture).
+ * AES implementation using AES-NI opcodes (x86 platform).
  */
 
 /** \brief AES block size (16 bytes). */
@@ -1021,12 +1022,6 @@ typedef struct {
 #ifndef BR_DOXYGEN_IGNORE
 	union {
 		unsigned char skni[16 * 15];
-		struct {
-			uint32_t skey[60];
-		} fallback_ct;
-		struct {
-			uint64_t skey[30];
-		} fallback_ct64;
 	} skey;
 	unsigned num_rounds;
 #endif
@@ -1044,12 +1039,6 @@ typedef struct {
 #ifndef BR_DOXYGEN_IGNORE
 	union {
 		unsigned char skni[16 * 15];
-		struct {
-			uint32_t skey[60];
-		} fallback_ct;
-		struct {
-			uint64_t skey[30];
-		} fallback_ct64;
 	} skey;
 	unsigned num_rounds;
 #endif
@@ -1068,12 +1057,6 @@ typedef struct {
 #ifndef BR_DOXYGEN_IGNORE
 	union {
 		unsigned char skni[16 * 15];
-		struct {
-			uint32_t skey[60];
-		} fallback_ct;
-		struct {
-			uint64_t skey[30];
-		} fallback_ct64;
 	} skey;
 	unsigned num_rounds;
 #endif
@@ -1213,6 +1196,199 @@ const br_block_cbcdec_class *br_aes_x86ni_cbcdec_get_vtable(void);
  */
 const br_block_ctr_class *br_aes_x86ni_ctr_get_vtable(void);
 
+/*
+ * AES implementation using POWER8 opcodes.
+ */
+
+/** \brief AES block size (16 bytes). */
+#define br_aes_pwr8_BLOCK_SIZE   16
+
+/**
+ * \brief Context for AES subkeys (`aes_pwr8` implementation, CBC encryption).
+ *
+ * First field is a pointer to the vtable; it is set by the initialisation
+ * function. Other fields are not supposed to be accessed by user code.
+ */
+typedef struct {
+	/** \brief Pointer to vtable for this context. */
+	const br_block_cbcenc_class *vtable;
+#ifndef BR_DOXYGEN_IGNORE
+	union {
+		unsigned char skni[16 * 15];
+	} skey;
+	unsigned num_rounds;
+#endif
+} br_aes_pwr8_cbcenc_keys;
+
+/**
+ * \brief Context for AES subkeys (`aes_pwr8` implementation, CBC decryption).
+ *
+ * First field is a pointer to the vtable; it is set by the initialisation
+ * function. Other fields are not supposed to be accessed by user code.
+ */
+typedef struct {
+	/** \brief Pointer to vtable for this context. */
+	const br_block_cbcdec_class *vtable;
+#ifndef BR_DOXYGEN_IGNORE
+	union {
+		unsigned char skni[16 * 15];
+	} skey;
+	unsigned num_rounds;
+#endif
+} br_aes_pwr8_cbcdec_keys;
+
+/**
+ * \brief Context for AES subkeys (`aes_pwr8` implementation, CTR encryption
+ * and decryption).
+ *
+ * First field is a pointer to the vtable; it is set by the initialisation
+ * function. Other fields are not supposed to be accessed by user code.
+ */
+typedef struct {
+	/** \brief Pointer to vtable for this context. */
+	const br_block_ctr_class *vtable;
+#ifndef BR_DOXYGEN_IGNORE
+	union {
+		unsigned char skni[16 * 15];
+	} skey;
+	unsigned num_rounds;
+#endif
+} br_aes_pwr8_ctr_keys;
+
+/**
+ * \brief Class instance for AES CBC encryption (`aes_pwr8` implementation).
+ *
+ * Since this implementation might be omitted from the library, or the
+ * AES opcode unavailable on the current CPU, a pointer to this class
+ * instance should be obtained through `br_aes_pwr8_cbcenc_get_vtable()`.
+ */
+extern const br_block_cbcenc_class br_aes_pwr8_cbcenc_vtable;
+
+/**
+ * \brief Class instance for AES CBC decryption (`aes_pwr8` implementation).
+ *
+ * Since this implementation might be omitted from the library, or the
+ * AES opcode unavailable on the current CPU, a pointer to this class
+ * instance should be obtained through `br_aes_pwr8_cbcdec_get_vtable()`.
+ */
+extern const br_block_cbcdec_class br_aes_pwr8_cbcdec_vtable;
+
+/**
+ * \brief Class instance for AES CTR encryption and decryption
+ * (`aes_pwr8` implementation).
+ *
+ * Since this implementation might be omitted from the library, or the
+ * AES opcode unavailable on the current CPU, a pointer to this class
+ * instance should be obtained through `br_aes_pwr8_ctr_get_vtable()`.
+ */
+extern const br_block_ctr_class br_aes_pwr8_ctr_vtable;
+
+/**
+ * \brief Context initialisation (key schedule) for AES CBC encryption
+ * (`aes_pwr8` implementation).
+ *
+ * \param ctx   context to initialise.
+ * \param key   secret key.
+ * \param len   secret key length (in bytes).
+ */
+void br_aes_pwr8_cbcenc_init(br_aes_pwr8_cbcenc_keys *ctx,
+	const void *key, size_t len);
+
+/**
+ * \brief Context initialisation (key schedule) for AES CBC decryption
+ * (`aes_pwr8` implementation).
+ *
+ * \param ctx   context to initialise.
+ * \param key   secret key.
+ * \param len   secret key length (in bytes).
+ */
+void br_aes_pwr8_cbcdec_init(br_aes_pwr8_cbcdec_keys *ctx,
+	const void *key, size_t len);
+
+/**
+ * \brief Context initialisation (key schedule) for AES CTR encryption
+ * and decryption (`aes_pwr8` implementation).
+ *
+ * \param ctx   context to initialise.
+ * \param key   secret key.
+ * \param len   secret key length (in bytes).
+ */
+void br_aes_pwr8_ctr_init(br_aes_pwr8_ctr_keys *ctx,
+	const void *key, size_t len);
+
+/**
+ * \brief CBC encryption with AES (`aes_pwr8` implementation).
+ *
+ * \param ctx    context (already initialised).
+ * \param iv     IV (updated).
+ * \param data   data to encrypt (updated).
+ * \param len    data length (in bytes, MUST be multiple of 16).
+ */
+void br_aes_pwr8_cbcenc_run(const br_aes_pwr8_cbcenc_keys *ctx, void *iv,
+	void *data, size_t len);
+
+/**
+ * \brief CBC decryption with AES (`aes_pwr8` implementation).
+ *
+ * \param ctx    context (already initialised).
+ * \param iv     IV (updated).
+ * \param data   data to decrypt (updated).
+ * \param len    data length (in bytes, MUST be multiple of 16).
+ */
+void br_aes_pwr8_cbcdec_run(const br_aes_pwr8_cbcdec_keys *ctx, void *iv,
+	void *data, size_t len);
+
+/**
+ * \brief CTR encryption and decryption with AES (`aes_pwr8` implementation).
+ *
+ * \param ctx    context (already initialised).
+ * \param iv     IV (constant, 12 bytes).
+ * \param cc     initial block counter value.
+ * \param data   data to encrypt or decrypt (updated).
+ * \param len    data length (in bytes).
+ * \return  new block counter value.
+ */
+uint32_t br_aes_pwr8_ctr_run(const br_aes_pwr8_ctr_keys *ctx,
+	const void *iv, uint32_t cc, void *data, size_t len);
+
+/**
+ * \brief Obtain the `aes_pwr8` AES-CBC (encryption) implementation, if
+ * available.
+ *
+ * This function returns a pointer to `br_aes_pwr8_cbcenc_vtable`, if
+ * that implementation was compiled in the library _and_ the POWER8 AES
+ * opcodes are available on the currently running CPU. If either of
+ * these conditions is not met, then this function returns `NULL`.
+ *
+ * \return  the `aes_pwr8` AES-CBC (encryption) implementation, or `NULL`.
+ */
+const br_block_cbcenc_class *br_aes_pwr8_cbcenc_get_vtable(void);
+
+/**
+ * \brief Obtain the `aes_pwr8` AES-CBC (decryption) implementation, if
+ * available.
+ *
+ * This function returns a pointer to `br_aes_pwr8_cbcdec_vtable`, if
+ * that implementation was compiled in the library _and_ the POWER8 AES
+ * opcodes are available on the currently running CPU. If either of
+ * these conditions is not met, then this function returns `NULL`.
+ *
+ * \return  the `aes_pwr8` AES-CBC (decryption) implementation, or `NULL`.
+ */
+const br_block_cbcdec_class *br_aes_pwr8_cbcdec_get_vtable(void);
+
+/**
+ * \brief Obtain the `aes_pwr8` AES-CTR implementation, if available.
+ *
+ * This function returns a pointer to `br_aes_pwr8_ctr_vtable`, if
+ * that implementation was compiled in the library _and_ the POWER8 AES
+ * opcodes are available on the currently running CPU. If either of
+ * these conditions is not met, then this function returns `NULL`.
+ *
+ * \return  the `aes_pwr8` AES-CTR implementation, or `NULL`.
+ */
+const br_block_ctr_class *br_aes_pwr8_ctr_get_vtable(void);
+
 /**
  * \brief Aggregate structure large enough to be used as context for
  * subkeys (CBC encryption) for all AES implementations.
@@ -1224,6 +1400,7 @@ typedef union {
 	br_aes_ct_cbcenc_keys c_ct;
 	br_aes_ct64_cbcenc_keys c_ct64;
 	br_aes_x86ni_cbcenc_keys c_x86ni;
+	br_aes_pwr8_cbcenc_keys c_pwr8;
 } br_aes_gen_cbcenc_keys;
 
 /**
@@ -1237,6 +1414,7 @@ typedef union {
 	br_aes_ct_cbcdec_keys c_ct;
 	br_aes_ct64_cbcdec_keys c_ct64;
 	br_aes_x86ni_cbcdec_keys c_x86ni;
+	br_aes_pwr8_cbcdec_keys c_pwr8;
 } br_aes_gen_cbcdec_keys;
 
 /**
@@ -1250,6 +1428,7 @@ typedef union {
 	br_aes_ct_ctr_keys c_ct;
 	br_aes_ct64_ctr_keys c_ct64;
 	br_aes_x86ni_ctr_keys c_x86ni;
+	br_aes_pwr8_ctr_keys c_pwr8;
 } br_aes_gen_ctr_keys;
 
 /*
diff --git a/inc/bearssl_hash.h b/inc/bearssl_hash.h
index 524ac01..d06bae4 100644
--- a/inc/bearssl_hash.h
+++ b/inc/bearssl_hash.h
@@ -1309,4 +1309,30 @@ void br_ghash_pclmul(void *y, const void *h, const void *data, size_t len);
  */
 br_ghash br_ghash_pclmul_get(void);
 
+/**
+ * \brief GHASH implementation using the POWER8 opcodes.
+ *
+ * This implementation is available only on POWER8 platforms (and later).
+ * To safely obtain a pointer to this function when supported (or 0
+ * otherwise), use `br_ghash_pwr8_get()`.
+ *
+ * \param y      the array to update.
+ * \param h      the GHASH key.
+ * \param data   the input data (may be `NULL` if `len` is zero).
+ * \param len    the input data length (in bytes).
+ */
+void br_ghash_pwr8(void *y, const void *h, const void *data, size_t len);
+
+/**
+ * \brief Obtain the `pwr8` GHASH implementation, if available.
+ *
+ * If the `pwr8` implementation was compiled in the library (depending
+ * on the compiler abilities) _and_ the local CPU appears to support the
+ * opcode, then this function will return a pointer to the
+ * `br_ghash_pwr8()` function. Otherwise, it will return `0`.
+ *
+ * \return  the `pwr8` GHASH implementation, or `0`.
+ */
+br_ghash br_ghash_pwr8_get(void);
+
 #endif
diff --git a/mk/Rules.mk b/mk/Rules.mk
index 9e2ba07..742f35c 100644
--- a/mk/Rules.mk
+++ b/mk/Rules.mk
@@ -1,6 +1,6 @@
 # Automatically generated rules. Use 'mkrules.sh' to modify/regenerate.
 
-OBJ = $(OBJDIR)$Pccopy$O $(OBJDIR)$Pdec16be$O $(OBJDIR)$Pdec16le$O $(OBJDIR)$Pdec32be$O $(OBJDIR)$Pdec32le$O $(OBJDIR)$Pdec64be$O $(OBJDIR)$Pdec64le$O $(OBJDIR)$Penc16be$O $(OBJDIR)$Penc16le$O $(OBJDIR)$Penc32be$O $(OBJDIR)$Penc32le$O $(OBJDIR)$Penc64be$O $(OBJDIR)$Penc64le$O $(OBJDIR)$Ppemdec$O $(OBJDIR)$Pec_all_m15$O $(OBJDIR)$Pec_all_m31$O $(OBJDIR)$Pec_c25519_i15$O $(OBJDIR)$Pec_c25519_i31$O $(OBJDIR)$Pec_c25519_m15$O $(OBJDIR)$Pec_c25519_m31$O $(OBJDIR)$Pec_curve25519$O $(OBJDIR)$Pec_default$O $(OBJDIR)$Pec_p256_m15$O $(OBJDIR)$Pec_p256_m31$O $(OBJDIR)$Pec_prime_i15$O $(OBJDIR)$Pec_prime_i31$O $(OBJDIR)$Pec_secp256r1$O $(OBJDIR)$Pec_secp384r1$O $(OBJDIR)$Pec_secp521r1$O $(OBJDIR)$Pecdsa_atr$O $(OBJDIR)$Pecdsa_default_sign_asn1$O $(OBJDIR)$Pecdsa_default_sign_raw$O $(OBJDIR)$Pecdsa_default_vrfy_asn1$O $(OBJDIR)$Pecdsa_default_vrfy_raw$O $(OBJDIR)$Pecdsa_i15_bits$O $(OBJDIR)$Pecdsa_i15_sign_asn1$O $(OBJDIR)$Pecdsa_i15_sign_raw$O $(OBJDIR)$Pecdsa_i15_vrfy_asn1$O $(OBJDIR)$Pecdsa_i15_vrfy_raw$O $(OBJDIR)$Pecdsa_i31_bits$O $(OBJDIR)$Pecdsa_i31_sign_asn1$O $(OBJDIR)$Pecdsa_i31_sign_raw$O $(OBJDIR)$Pecdsa_i31_vrfy_asn1$O $(OBJDIR)$Pecdsa_i31_vrfy_raw$O $(OBJDIR)$Pecdsa_rta$O $(OBJDIR)$Pdig_oid$O $(OBJDIR)$Pdig_size$O $(OBJDIR)$Pghash_ctmul$O $(OBJDIR)$Pghash_ctmul32$O $(OBJDIR)$Pghash_ctmul64$O $(OBJDIR)$Pghash_pclmul$O $(OBJDIR)$Pmd5$O $(OBJDIR)$Pmd5sha1$O $(OBJDIR)$Pmultihash$O $(OBJDIR)$Psha1$O $(OBJDIR)$Psha2big$O $(OBJDIR)$Psha2small$O $(OBJDIR)$Pi15_add$O $(OBJDIR)$Pi15_bitlen$O $(OBJDIR)$Pi15_decmod$O $(OBJDIR)$Pi15_decode$O $(OBJDIR)$Pi15_decred$O $(OBJDIR)$Pi15_encode$O $(OBJDIR)$Pi15_fmont$O $(OBJDIR)$Pi15_iszero$O $(OBJDIR)$Pi15_modpow$O $(OBJDIR)$Pi15_modpow2$O $(OBJDIR)$Pi15_montmul$O $(OBJDIR)$Pi15_mulacc$O $(OBJDIR)$Pi15_muladd$O $(OBJDIR)$Pi15_ninv15$O $(OBJDIR)$Pi15_reduce$O $(OBJDIR)$Pi15_rshift$O $(OBJDIR)$Pi15_sub$O $(OBJDIR)$Pi15_tmont$O $(OBJDIR)$Pi31_add$O $(OBJDIR)$Pi31_bitlen$O $(OBJDIR)$Pi31_decmod$O $(OBJDIR)$Pi31_decode$O 
$(OBJDIR)$Pi31_decred$O $(OBJDIR)$Pi31_encode$O $(OBJDIR)$Pi31_fmont$O $(OBJDIR)$Pi31_iszero$O $(OBJDIR)$Pi31_modpow$O $(OBJDIR)$Pi31_montmul$O $(OBJDIR)$Pi31_mulacc$O $(OBJDIR)$Pi31_muladd$O $(OBJDIR)$Pi31_ninv31$O $(OBJDIR)$Pi31_reduce$O $(OBJDIR)$Pi31_rshift$O $(OBJDIR)$Pi31_sub$O $(OBJDIR)$Pi31_tmont$O $(OBJDIR)$Pi32_add$O $(OBJDIR)$Pi32_bitlen$O $(OBJDIR)$Pi32_decmod$O $(OBJDIR)$Pi32_decode$O $(OBJDIR)$Pi32_decred$O $(OBJDIR)$Pi32_div32$O $(OBJDIR)$Pi32_encode$O $(OBJDIR)$Pi32_fmont$O $(OBJDIR)$Pi32_iszero$O $(OBJDIR)$Pi32_modpow$O $(OBJDIR)$Pi32_montmul$O $(OBJDIR)$Pi32_mulacc$O $(OBJDIR)$Pi32_muladd$O $(OBJDIR)$Pi32_ninv32$O $(OBJDIR)$Pi32_reduce$O $(OBJDIR)$Pi32_sub$O $(OBJDIR)$Pi32_tmont$O $(OBJDIR)$Phmac$O $(OBJDIR)$Phmac_ct$O $(OBJDIR)$Phmac_drbg$O $(OBJDIR)$Prsa_default_pkcs1_sign$O $(OBJDIR)$Prsa_default_pkcs1_vrfy$O $(OBJDIR)$Prsa_default_priv$O $(OBJDIR)$Prsa_default_pub$O $(OBJDIR)$Prsa_i15_pkcs1_sign$O $(OBJDIR)$Prsa_i15_pkcs1_vrfy$O $(OBJDIR)$Prsa_i15_priv$O $(OBJDIR)$Prsa_i15_pub$O $(OBJDIR)$Prsa_i31_pkcs1_sign$O $(OBJDIR)$Prsa_i31_pkcs1_vrfy$O $(OBJDIR)$Prsa_i31_priv$O $(OBJDIR)$Prsa_i31_pub$O $(OBJDIR)$Prsa_i32_pkcs1_sign$O $(OBJDIR)$Prsa_i32_pkcs1_vrfy$O $(OBJDIR)$Prsa_i32_priv$O $(OBJDIR)$Prsa_i32_pub$O $(OBJDIR)$Prsa_pkcs1_sig_pad$O $(OBJDIR)$Prsa_pkcs1_sig_unpad$O $(OBJDIR)$Prsa_ssl_decrypt$O $(OBJDIR)$Pprf$O $(OBJDIR)$Pprf_md5sha1$O $(OBJDIR)$Pprf_sha256$O $(OBJDIR)$Pprf_sha384$O $(OBJDIR)$Pssl_ccert_single_ec$O $(OBJDIR)$Pssl_ccert_single_rsa$O $(OBJDIR)$Pssl_client$O $(OBJDIR)$Pssl_client_default_rsapub$O $(OBJDIR)$Pssl_client_full$O $(OBJDIR)$Pssl_engine$O $(OBJDIR)$Pssl_engine_default_aescbc$O $(OBJDIR)$Pssl_engine_default_aesgcm$O $(OBJDIR)$Pssl_engine_default_chapol$O $(OBJDIR)$Pssl_engine_default_descbc$O $(OBJDIR)$Pssl_engine_default_ec$O $(OBJDIR)$Pssl_engine_default_ecdsa$O $(OBJDIR)$Pssl_engine_default_rsavrfy$O $(OBJDIR)$Pssl_hashes$O $(OBJDIR)$Pssl_hs_client$O $(OBJDIR)$Pssl_hs_server$O $(OBJDIR)$Pssl_io$O $(OBJDIR)$Pssl_lru$O 
$(OBJDIR)$Pssl_rec_cbc$O $(OBJDIR)$Pssl_rec_chapol$O $(OBJDIR)$Pssl_rec_gcm$O $(OBJDIR)$Pssl_scert_single_ec$O $(OBJDIR)$Pssl_scert_single_rsa$O $(OBJDIR)$Pssl_server$O $(OBJDIR)$Pssl_server_full_ec$O $(OBJDIR)$Pssl_server_full_rsa$O $(OBJDIR)$Pssl_server_mine2c$O $(OBJDIR)$Pssl_server_mine2g$O $(OBJDIR)$Pssl_server_minf2c$O $(OBJDIR)$Pssl_server_minf2g$O $(OBJDIR)$Pssl_server_minr2g$O $(OBJDIR)$Pssl_server_minu2g$O $(OBJDIR)$Pssl_server_minv2g$O $(OBJDIR)$Paes_big_cbcdec$O $(OBJDIR)$Paes_big_cbcenc$O $(OBJDIR)$Paes_big_ctr$O $(OBJDIR)$Paes_big_dec$O $(OBJDIR)$Paes_big_enc$O $(OBJDIR)$Paes_common$O $(OBJDIR)$Paes_ct$O $(OBJDIR)$Paes_ct64$O $(OBJDIR)$Paes_ct64_cbcdec$O $(OBJDIR)$Paes_ct64_cbcenc$O $(OBJDIR)$Paes_ct64_ctr$O $(OBJDIR)$Paes_ct64_dec$O $(OBJDIR)$Paes_ct64_enc$O $(OBJDIR)$Paes_ct_cbcdec$O $(OBJDIR)$Paes_ct_cbcenc$O $(OBJDIR)$Paes_ct_ctr$O $(OBJDIR)$Paes_ct_dec$O $(OBJDIR)$Paes_ct_enc$O $(OBJDIR)$Paes_small_cbcdec$O $(OBJDIR)$Paes_small_cbcenc$O $(OBJDIR)$Paes_small_ctr$O $(OBJDIR)$Paes_small_dec$O $(OBJDIR)$Paes_small_enc$O $(OBJDIR)$Paes_x86ni$O $(OBJDIR)$Paes_x86ni_cbcdec$O $(OBJDIR)$Paes_x86ni_cbcenc$O $(OBJDIR)$Paes_x86ni_ctr$O $(OBJDIR)$Pchacha20_ct$O $(OBJDIR)$Pdes_ct$O $(OBJDIR)$Pdes_ct_cbcdec$O $(OBJDIR)$Pdes_ct_cbcenc$O $(OBJDIR)$Pdes_support$O $(OBJDIR)$Pdes_tab$O $(OBJDIR)$Pdes_tab_cbcdec$O $(OBJDIR)$Pdes_tab_cbcenc$O $(OBJDIR)$Ppoly1305_ctmul$O $(OBJDIR)$Ppoly1305_ctmul32$O $(OBJDIR)$Ppoly1305_i15$O $(OBJDIR)$Pskey_decoder$O $(OBJDIR)$Px509_decoder$O $(OBJDIR)$Px509_knownkey$O $(OBJDIR)$Px509_minimal$O $(OBJDIR)$Px509_minimal_full$O
+OBJ = $(OBJDIR)$Pccopy$O $(OBJDIR)$Pdec16be$O $(OBJDIR)$Pdec16le$O $(OBJDIR)$Pdec32be$O $(OBJDIR)$Pdec32le$O $(OBJDIR)$Pdec64be$O $(OBJDIR)$Pdec64le$O $(OBJDIR)$Penc16be$O $(OBJDIR)$Penc16le$O $(OBJDIR)$Penc32be$O $(OBJDIR)$Penc32le$O $(OBJDIR)$Penc64be$O $(OBJDIR)$Penc64le$O $(OBJDIR)$Ppemdec$O $(OBJDIR)$Pec_all_m15$O $(OBJDIR)$Pec_all_m31$O $(OBJDIR)$Pec_c25519_i15$O $(OBJDIR)$Pec_c25519_i31$O $(OBJDIR)$Pec_c25519_m15$O $(OBJDIR)$Pec_c25519_m31$O $(OBJDIR)$Pec_curve25519$O $(OBJDIR)$Pec_default$O $(OBJDIR)$Pec_p256_m15$O $(OBJDIR)$Pec_p256_m31$O $(OBJDIR)$Pec_prime_i15$O $(OBJDIR)$Pec_prime_i31$O $(OBJDIR)$Pec_secp256r1$O $(OBJDIR)$Pec_secp384r1$O $(OBJDIR)$Pec_secp521r1$O $(OBJDIR)$Pecdsa_atr$O $(OBJDIR)$Pecdsa_default_sign_asn1$O $(OBJDIR)$Pecdsa_default_sign_raw$O $(OBJDIR)$Pecdsa_default_vrfy_asn1$O $(OBJDIR)$Pecdsa_default_vrfy_raw$O $(OBJDIR)$Pecdsa_i15_bits$O $(OBJDIR)$Pecdsa_i15_sign_asn1$O $(OBJDIR)$Pecdsa_i15_sign_raw$O $(OBJDIR)$Pecdsa_i15_vrfy_asn1$O $(OBJDIR)$Pecdsa_i15_vrfy_raw$O $(OBJDIR)$Pecdsa_i31_bits$O $(OBJDIR)$Pecdsa_i31_sign_asn1$O $(OBJDIR)$Pecdsa_i31_sign_raw$O $(OBJDIR)$Pecdsa_i31_vrfy_asn1$O $(OBJDIR)$Pecdsa_i31_vrfy_raw$O $(OBJDIR)$Pecdsa_rta$O $(OBJDIR)$Pdig_oid$O $(OBJDIR)$Pdig_size$O $(OBJDIR)$Pghash_ctmul$O $(OBJDIR)$Pghash_ctmul32$O $(OBJDIR)$Pghash_ctmul64$O $(OBJDIR)$Pghash_pclmul$O $(OBJDIR)$Pghash_pwr8$O $(OBJDIR)$Pmd5$O $(OBJDIR)$Pmd5sha1$O $(OBJDIR)$Pmultihash$O $(OBJDIR)$Psha1$O $(OBJDIR)$Psha2big$O $(OBJDIR)$Psha2small$O $(OBJDIR)$Pi15_add$O $(OBJDIR)$Pi15_bitlen$O $(OBJDIR)$Pi15_decmod$O $(OBJDIR)$Pi15_decode$O $(OBJDIR)$Pi15_decred$O $(OBJDIR)$Pi15_encode$O $(OBJDIR)$Pi15_fmont$O $(OBJDIR)$Pi15_iszero$O $(OBJDIR)$Pi15_modpow$O $(OBJDIR)$Pi15_modpow2$O $(OBJDIR)$Pi15_montmul$O $(OBJDIR)$Pi15_mulacc$O $(OBJDIR)$Pi15_muladd$O $(OBJDIR)$Pi15_ninv15$O $(OBJDIR)$Pi15_reduce$O $(OBJDIR)$Pi15_rshift$O $(OBJDIR)$Pi15_sub$O $(OBJDIR)$Pi15_tmont$O $(OBJDIR)$Pi31_add$O $(OBJDIR)$Pi31_bitlen$O $(OBJDIR)$Pi31_decmod$O 
$(OBJDIR)$Pi31_decode$O $(OBJDIR)$Pi31_decred$O $(OBJDIR)$Pi31_encode$O $(OBJDIR)$Pi31_fmont$O $(OBJDIR)$Pi31_iszero$O $(OBJDIR)$Pi31_modpow$O $(OBJDIR)$Pi31_montmul$O $(OBJDIR)$Pi31_mulacc$O $(OBJDIR)$Pi31_muladd$O $(OBJDIR)$Pi31_ninv31$O $(OBJDIR)$Pi31_reduce$O $(OBJDIR)$Pi31_rshift$O $(OBJDIR)$Pi31_sub$O $(OBJDIR)$Pi31_tmont$O $(OBJDIR)$Pi32_add$O $(OBJDIR)$Pi32_bitlen$O $(OBJDIR)$Pi32_decmod$O $(OBJDIR)$Pi32_decode$O $(OBJDIR)$Pi32_decred$O $(OBJDIR)$Pi32_div32$O $(OBJDIR)$Pi32_encode$O $(OBJDIR)$Pi32_fmont$O $(OBJDIR)$Pi32_iszero$O $(OBJDIR)$Pi32_modpow$O $(OBJDIR)$Pi32_montmul$O $(OBJDIR)$Pi32_mulacc$O $(OBJDIR)$Pi32_muladd$O $(OBJDIR)$Pi32_ninv32$O $(OBJDIR)$Pi32_reduce$O $(OBJDIR)$Pi32_sub$O $(OBJDIR)$Pi32_tmont$O $(OBJDIR)$Phmac$O $(OBJDIR)$Phmac_ct$O $(OBJDIR)$Phmac_drbg$O $(OBJDIR)$Prsa_default_pkcs1_sign$O $(OBJDIR)$Prsa_default_pkcs1_vrfy$O $(OBJDIR)$Prsa_default_priv$O $(OBJDIR)$Prsa_default_pub$O $(OBJDIR)$Prsa_i15_pkcs1_sign$O $(OBJDIR)$Prsa_i15_pkcs1_vrfy$O $(OBJDIR)$Prsa_i15_priv$O $(OBJDIR)$Prsa_i15_pub$O $(OBJDIR)$Prsa_i31_pkcs1_sign$O $(OBJDIR)$Prsa_i31_pkcs1_vrfy$O $(OBJDIR)$Prsa_i31_priv$O $(OBJDIR)$Prsa_i31_pub$O $(OBJDIR)$Prsa_i32_pkcs1_sign$O $(OBJDIR)$Prsa_i32_pkcs1_vrfy$O $(OBJDIR)$Prsa_i32_priv$O $(OBJDIR)$Prsa_i32_pub$O $(OBJDIR)$Prsa_pkcs1_sig_pad$O $(OBJDIR)$Prsa_pkcs1_sig_unpad$O $(OBJDIR)$Prsa_ssl_decrypt$O $(OBJDIR)$Pprf$O $(OBJDIR)$Pprf_md5sha1$O $(OBJDIR)$Pprf_sha256$O $(OBJDIR)$Pprf_sha384$O $(OBJDIR)$Pssl_ccert_single_ec$O $(OBJDIR)$Pssl_ccert_single_rsa$O $(OBJDIR)$Pssl_client$O $(OBJDIR)$Pssl_client_default_rsapub$O $(OBJDIR)$Pssl_client_full$O $(OBJDIR)$Pssl_engine$O $(OBJDIR)$Pssl_engine_default_aescbc$O $(OBJDIR)$Pssl_engine_default_aesgcm$O $(OBJDIR)$Pssl_engine_default_chapol$O $(OBJDIR)$Pssl_engine_default_descbc$O $(OBJDIR)$Pssl_engine_default_ec$O $(OBJDIR)$Pssl_engine_default_ecdsa$O $(OBJDIR)$Pssl_engine_default_rsavrfy$O $(OBJDIR)$Pssl_hashes$O $(OBJDIR)$Pssl_hs_client$O $(OBJDIR)$Pssl_hs_server$O 
$(OBJDIR)$Pssl_io$O $(OBJDIR)$Pssl_lru$O $(OBJDIR)$Pssl_rec_cbc$O $(OBJDIR)$Pssl_rec_chapol$O $(OBJDIR)$Pssl_rec_gcm$O $(OBJDIR)$Pssl_scert_single_ec$O $(OBJDIR)$Pssl_scert_single_rsa$O $(OBJDIR)$Pssl_server$O $(OBJDIR)$Pssl_server_full_ec$O $(OBJDIR)$Pssl_server_full_rsa$O $(OBJDIR)$Pssl_server_mine2c$O $(OBJDIR)$Pssl_server_mine2g$O $(OBJDIR)$Pssl_server_minf2c$O $(OBJDIR)$Pssl_server_minf2g$O $(OBJDIR)$Pssl_server_minr2g$O $(OBJDIR)$Pssl_server_minu2g$O $(OBJDIR)$Pssl_server_minv2g$O $(OBJDIR)$Paes_big_cbcdec$O $(OBJDIR)$Paes_big_cbcenc$O $(OBJDIR)$Paes_big_ctr$O $(OBJDIR)$Paes_big_dec$O $(OBJDIR)$Paes_big_enc$O $(OBJDIR)$Paes_common$O $(OBJDIR)$Paes_ct$O $(OBJDIR)$Paes_ct64$O $(OBJDIR)$Paes_ct64_cbcdec$O $(OBJDIR)$Paes_ct64_cbcenc$O $(OBJDIR)$Paes_ct64_ctr$O $(OBJDIR)$Paes_ct64_dec$O $(OBJDIR)$Paes_ct64_enc$O $(OBJDIR)$Paes_ct_cbcdec$O $(OBJDIR)$Paes_ct_cbcenc$O $(OBJDIR)$Paes_ct_ctr$O $(OBJDIR)$Paes_ct_dec$O $(OBJDIR)$Paes_ct_enc$O $(OBJDIR)$Paes_pwr8$O $(OBJDIR)$Paes_pwr8_cbcdec$O $(OBJDIR)$Paes_pwr8_cbcenc$O $(OBJDIR)$Paes_pwr8_ctr$O $(OBJDIR)$Paes_small_cbcdec$O $(OBJDIR)$Paes_small_cbcenc$O $(OBJDIR)$Paes_small_ctr$O $(OBJDIR)$Paes_small_dec$O $(OBJDIR)$Paes_small_enc$O $(OBJDIR)$Paes_x86ni$O $(OBJDIR)$Paes_x86ni_cbcdec$O $(OBJDIR)$Paes_x86ni_cbcenc$O $(OBJDIR)$Paes_x86ni_ctr$O $(OBJDIR)$Pchacha20_ct$O $(OBJDIR)$Pdes_ct$O $(OBJDIR)$Pdes_ct_cbcdec$O $(OBJDIR)$Pdes_ct_cbcenc$O $(OBJDIR)$Pdes_support$O $(OBJDIR)$Pdes_tab$O $(OBJDIR)$Pdes_tab_cbcdec$O $(OBJDIR)$Pdes_tab_cbcenc$O $(OBJDIR)$Ppoly1305_ctmul$O $(OBJDIR)$Ppoly1305_ctmul32$O $(OBJDIR)$Ppoly1305_i15$O $(OBJDIR)$Pskey_decoder$O $(OBJDIR)$Px509_decoder$O $(OBJDIR)$Px509_knownkey$O $(OBJDIR)$Px509_minimal$O $(OBJDIR)$Px509_minimal_full$O
 OBJBRSSL = $(OBJDIR)$Pbrssl$O $(OBJDIR)$Pcerts$O $(OBJDIR)$Pchain$O $(OBJDIR)$Pclient$O $(OBJDIR)$Perrors$O $(OBJDIR)$Pfiles$O $(OBJDIR)$Pkeys$O $(OBJDIR)$Pnames$O $(OBJDIR)$Pserver$O $(OBJDIR)$Pskey$O $(OBJDIR)$Psslio$O $(OBJDIR)$Pta$O $(OBJDIR)$Pvector$O $(OBJDIR)$Pverify$O $(OBJDIR)$Pxmem$O
 OBJTESTCRYPTO = $(OBJDIR)$Ptest_crypto$O
 OBJTESTSPEED = $(OBJDIR)$Ptest_speed$O
@@ -212,6 +212,9 @@ $(OBJDIR)$Pghash_ctmul64$O: src$Phash$Pghash_ctmul64.c $(HEADERSPRIV)
 $(OBJDIR)$Pghash_pclmul$O: src$Phash$Pghash_pclmul.c $(HEADERSPRIV)
 	$(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Pghash_pclmul$O src$Phash$Pghash_pclmul.c
 
+$(OBJDIR)$Pghash_pwr8$O: src$Phash$Pghash_pwr8.c $(HEADERSPRIV)
+	$(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Pghash_pwr8$O src$Phash$Pghash_pwr8.c
+
 $(OBJDIR)$Pmd5$O: src$Phash$Pmd5.c $(HEADERSPRIV)
 	$(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Pmd5$O src$Phash$Pmd5.c
 
@@ -617,6 +620,18 @@ $(OBJDIR)$Paes_ct_dec$O: src$Psymcipher$Paes_ct_dec.c $(HEADERSPRIV)
 $(OBJDIR)$Paes_ct_enc$O: src$Psymcipher$Paes_ct_enc.c $(HEADERSPRIV)
 	$(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Paes_ct_enc$O src$Psymcipher$Paes_ct_enc.c
 
+$(OBJDIR)$Paes_pwr8$O: src$Psymcipher$Paes_pwr8.c $(HEADERSPRIV)
+	$(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Paes_pwr8$O src$Psymcipher$Paes_pwr8.c
+
+$(OBJDIR)$Paes_pwr8_cbcdec$O: src$Psymcipher$Paes_pwr8_cbcdec.c $(HEADERSPRIV)
+	$(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Paes_pwr8_cbcdec$O src$Psymcipher$Paes_pwr8_cbcdec.c
+
+$(OBJDIR)$Paes_pwr8_cbcenc$O: src$Psymcipher$Paes_pwr8_cbcenc.c $(HEADERSPRIV)
+	$(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Paes_pwr8_cbcenc$O src$Psymcipher$Paes_pwr8_cbcenc.c
+
+$(OBJDIR)$Paes_pwr8_ctr$O: src$Psymcipher$Paes_pwr8_ctr.c $(HEADERSPRIV)
+	$(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Paes_pwr8_ctr$O src$Psymcipher$Paes_pwr8_ctr.c
+
 $(OBJDIR)$Paes_small_cbcdec$O: src$Psymcipher$Paes_small_cbcdec.c $(HEADERSPRIV)
 	$(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Paes_small_cbcdec$O src$Psymcipher$Paes_small_cbcdec.c
 
diff --git a/mk/mkrules.sh b/mk/mkrules.sh
index 4c9d2cd..929d305 100755
--- a/mk/mkrules.sh
+++ b/mk/mkrules.sh
@@ -100,6 +100,7 @@ coresrc=" \
 	src/hash/ghash_ctmul32.c \
 	src/hash/ghash_ctmul64.c \
 	src/hash/ghash_pclmul.c \
+	src/hash/ghash_pwr8.c \
 	src/hash/md5.c \
 	src/hash/md5sha1.c \
 	src/hash/multihash.c \
@@ -235,6 +236,10 @@ coresrc=" \
 	src/symcipher/aes_ct_ctr.c \
 	src/symcipher/aes_ct_dec.c \
 	src/symcipher/aes_ct_enc.c \
+	src/symcipher/aes_pwr8.c \
+	src/symcipher/aes_pwr8_cbcdec.c \
+	src/symcipher/aes_pwr8_cbcenc.c \
+	src/symcipher/aes_pwr8_ctr.c \
 	src/symcipher/aes_small_cbcdec.c \
 	src/symcipher/aes_small_cbcenc.c \
 	src/symcipher/aes_small_ctr.c \
diff --git a/src/config.h b/src/config.h
index 9eadaf4..d2e7a7d 100644
--- a/src/config.h
+++ b/src/config.h
@@ -162,4 +162,14 @@
 #define BR_AES_X86NI   1
  */
 
+/*
+ * When BR_POWER8 is enabled, the AES and GHASH implementations using the
+ * POWER ISA 2.07 opcodes (available on POWER8 processors and later) are
+ * compiled. If this is not enabled explicitly, then these implementations
+ * will be compiled only if a compatible compiler is detected, _and_ the
+ * target architecture is POWER8 or later.
+ *
+#define BR_POWER8   1
+ */
+
 #endif
diff --git a/src/hash/ghash_pwr8.c b/src/hash/ghash_pwr8.c
new file mode 100644
index 0000000..2e7b0f4
--- /dev/null
+++ b/src/hash/ghash_pwr8.c
@@ -0,0 +1,411 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define BR_POWER_ASM_MACROS   1
+#include "inner.h"
+
+/*
+ * This is the GHASH implementation that leverages the POWER8 opcodes.
+ */
+
+#if BR_POWER8
+
+/*
+ * Some symbolic names for registers.
+ *   HB0 = 16 bytes of value 0
+ *   HB1 = 16 bytes of value 1
+ *   HB2 = 16 bytes of value 2
+ *   HB6 = 16 bytes of value 6
+ *   HB7 = 16 bytes of value 7
+ *   TT0, TT1 and TT2 are temporaries
+ *
+ * BSW holds the pattern for byteswapping 32-bit words; this is set only
+ * on little-endian systems. XBSW is the same register with the +32 offset
+ * for access with the VSX opcodes.
+ */
+#define HB0     0
+#define HB1     1
+#define HB2     2
+#define HB6     3
+#define HB7     4
+#define TT0     5
+#define TT1     6
+#define TT2     7
+
+#define BSW     8
+#define XBSW   40
+
+/*
+ * Macro to initialise the constants.
+ */
+#define INIT \
+		vxor(HB0, HB0, HB0) \
+		vspltisb(HB1, 1) \
+		vspltisb(HB2, 2) \
+		vspltisb(HB6, 6) \
+		vspltisb(HB7, 7) \
+		INIT_BSW
+
+/*
+ * Fix endianness of a value after reading it or before writing it, if
+ * necessary.
+ */
+#if BR_POWER8_LE
+#define INIT_BSW         lxvw4x(XBSW, 0, %[idx2be])
+#define FIX_ENDIAN(xx)   vperm(xx, xx, xx, BSW)
+#else
+#define INIT_BSW
+#define FIX_ENDIAN(xx)
+#endif
+
+/*
+ * Shift x0:x1 by one bit to the left (TT0 is clobbered). This is a
+ * corrective action needed because GHASH is defined in full little-endian
+ * specification, while the opcodes use full big-endian convention, so the
+ * 255-bit product ends up one bit to the right.
+ */
+#define SL_256(x0, x1) \
+		vsldoi(TT0, HB0, x1, 1) \
+		vsl(x0, x0, HB1) \
+		vsr(TT0, TT0, HB7) \
+		vsl(x1, x1, HB1) \
+		vxor(x0, x0, TT0)
+
+/*
+ * Reduce x0:x1 in GF(2^128), result in xd (register xd may be the same as
+ * x0 or x1, or a different register). x0 and x1 are modified.
+ */
+#define REDUCE_F128(xd, x0, x1) \
+		vxor(x0, x0, x1) \
+		vsr(TT0, x1, HB1) \
+		vsr(TT1, x1, HB2) \
+		vsr(TT2, x1, HB7) \
+		vxor(x0, x0, TT0) \
+		vxor(TT1, TT1, TT2) \
+		vxor(x0, x0, TT1) \
+		vsldoi(x1, x1, HB0, 15) \
+		vsl(TT1, x1, HB6) \
+		vsl(TT2, x1, HB1) \
+		vxor(x1, TT1, TT2) \
+		vsr(TT0, x1, HB1) \
+		vsr(TT1, x1, HB2) \
+		vsr(TT2, x1, HB7) \
+		vxor(x0, x0, x1) \
+		vxor(x0, x0, TT0) \
+		vxor(TT1, TT1, TT2) \
+		vxor(xd, x0, TT1)
+
+/* see bearssl_hash.h */
+void
+br_ghash_pwr8(void *y, const void *h, const void *data, size_t len)
+{
+	const unsigned char *buf1, *buf2;
+	size_t num4, num1;
+	unsigned char tmp[64];
+	long cc0, cc1, cc2, cc3;
+
+#if BR_POWER8_LE
+	static const uint32_t idx2be[] = {
+		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
+	};
+#endif
+
+	buf1 = data;
+
+	/*
+	 * Assembly code requires data split into two chunks; the first
+	 * chunk must contain a number of blocks which is a multiple of 4.
+	 * Since the processing for the first chunk is faster, we want
+	 * to make it as big as possible.
+	 *
+	 * For the remainder, there are two possibilities:
+	 *  -- if the remainder size is a multiple of 16, then use it
+	 *     in place;
+	 *  -- otherwise, copy it to the tmp[] array and pad it with
+	 *     zeros.
+	 */
+	num4 = len >> 6;
+	buf2 = buf1 + (num4 << 6);
+	len &= 63;
+	num1 = (len + 15) >> 4;
+	if ((len & 15) != 0) {
+		memcpy(tmp, buf2, len);
+		memset(tmp + len, 0, (num1 << 4) - len);
+		buf2 = tmp;
+	}
+
+	cc0 =  0;
+	cc1 = 16;
+	cc2 = 32;
+	cc3 = 48;
+	asm volatile (
+		INIT
+
+		/*
+		 * Load current h (denoted hereafter h1) in v9.
+		 */
+		lxvw4x(41, 0, %[h])
+		FIX_ENDIAN(9)
+
+		/*
+		 * Load current y into v28.
+		 */
+		lxvw4x(60, 0, %[y])
+		FIX_ENDIAN(28)
+
+		/*
+		 * Split h1 into three registers:
+		 *   v17 = h1_1:h1_0
+		 *   v18 =    0:h1_0
+		 *   v19 = h1_1:0
+		 */
+		xxpermdi(49, 41, 41, 2)
+		vsldoi(18, HB0, 9, 8)
+		vsldoi(19, 9, HB0, 8)
+
+		/*
+		 * If num4 is 0, skip directly to the second chunk.
+		 */
+		cmpldi(%[num4], 0)
+		beq(chunk1)
+
+		/*
+		 * Compute h2 = h*h in v10.
+		 */
+		vpmsumd(10, 18, 18)
+		vpmsumd(11, 19, 19)
+		SL_256(10, 11)
+		REDUCE_F128(10, 10, 11)
+
+		/*
+		 * Compute h3 = h*h*h in v11.
+		 * We first split h2 into:
+		 *   v10 = h2_0:h2_1
+		 *   v11 =    0:h2_0
+		 *   v12 = h2_1:0
+		 * Then we do the product with h1, and reduce into v11.
+		 */
+		vsldoi(11, HB0, 10, 8)
+		vsldoi(12, 10, HB0, 8)
+		vpmsumd(13, 10, 17)
+		vpmsumd(11, 11, 18)
+		vpmsumd(12, 12, 19)
+		vsldoi(14, HB0, 13, 8)
+		vsldoi(15, 13, HB0, 8)
+		vxor(11, 11, 14)
+		vxor(12, 12, 15)
+		SL_256(11, 12)
+		REDUCE_F128(11, 11, 12)
+
+		/*
+		 * Compute h4 = h*h*h*h in v12. This is done by squaring h2.
+		 */
+		vsldoi(12, HB0, 10, 8)
+		vsldoi(13, 10, HB0, 8)
+		vpmsumd(12, 12, 12)
+		vpmsumd(13, 13, 13)
+		SL_256(12, 13)
+		REDUCE_F128(12, 12, 13)
+
+		/*
+		 * Repack h1, h2, h3 and h4:
+		 *   v13 = h4_0:h3_0
+		 *   v14 = h4_1:h3_1
+		 *   v15 = h2_0:h1_0
+		 *   v16 = h2_1:h1_1
+		 */
+		xxpermdi(45, 44, 43, 0)
+		xxpermdi(46, 44, 43, 3)
+		xxpermdi(47, 42, 41, 0)
+		xxpermdi(48, 42, 41, 3)
+
+		/*
+		 * Loop for each group of four blocks.
+		 */
+		mtctr(%[num4])
+	label(loop4)
+		/*
+		 * Read the four next blocks.
+		 *   v20 = y + a0 = b0
+		 *   v21 = a1     = b1
+		 *   v22 = a2     = b2
+		 *   v23 = a3     = b3
+		 */
+		lxvw4x(52, %[cc0], %[buf1])
+		lxvw4x(53, %[cc1], %[buf1])
+		lxvw4x(54, %[cc2], %[buf1])
+		lxvw4x(55, %[cc3], %[buf1])
+		FIX_ENDIAN(20)
+		FIX_ENDIAN(21)
+		FIX_ENDIAN(22)
+		FIX_ENDIAN(23)
+		addi(%[buf1], %[buf1], 64)
+		vxor(20, 20, 28)
+
+		/*
+		 * Repack the blocks into v9, v10, v11 and v12.
+		 *   v9  = b0_0:b1_0
+		 *   v10 = b0_1:b1_1
+		 *   v11 = b2_0:b3_0
+		 *   v12 = b2_1:b3_1
+		 */
+		xxpermdi(41, 52, 53, 0)
+		xxpermdi(42, 52, 53, 3)
+		xxpermdi(43, 54, 55, 0)
+		xxpermdi(44, 54, 55, 3)
+
+		/*
+		 * Compute the products.
+		 *   v20 = b0_0*h4_0 + b1_0*h3_0
+		 *   v21 = b0_1*h4_0 + b1_1*h3_0
+		 *   v22 = b0_0*h4_1 + b1_0*h3_1
+		 *   v23 = b0_1*h4_1 + b1_1*h3_1
+		 *   v24 = b2_0*h2_0 + b3_0*h1_0
+		 *   v25 = b2_1*h2_0 + b3_1*h1_0
+		 *   v26 = b2_0*h2_1 + b3_0*h1_1
+		 *   v27 = b2_1*h2_1 + b3_1*h1_1
+		 */
+		vpmsumd(20, 13,  9)
+		vpmsumd(21, 13, 10)
+		vpmsumd(22, 14,  9)
+		vpmsumd(23, 14, 10)
+		vpmsumd(24, 15, 11)
+		vpmsumd(25, 15, 12)
+		vpmsumd(26, 16, 11)
+		vpmsumd(27, 16, 12)
+
+		/*
+		 * Sum products into a single 256-bit result in v11:v12.
+		 */
+		vxor(11, 20, 24)
+		vxor(12, 23, 27)
+		vxor( 9, 21, 22)
+		vxor(10, 25, 26)
+		vxor(20,  9, 10)
+		vsldoi( 9, HB0, 20, 8)
+		vsldoi(10, 20, HB0, 8)
+		vxor(11, 11, 9)
+		vxor(12, 12, 10)
+
+		/*
+		 * Fix and reduce in GF(2^128); this is the new y (in v28).
+		 */
+		SL_256(11, 12)
+		REDUCE_F128(28, 11, 12)
+
+		/*
+		 * Loop for next group of four blocks.
+		 */
+		bdnz(loop4)
+
+		/*
+		 * Process second chunk, one block at a time.
+		 */
+	label(chunk1)
+		cmpldi(%[num1], 0)
+		beq(done)
+
+		mtctr(%[num1])
+	label(loop1)
+		/*
+		 * Load next data block and XOR it into y.
+		 */
+		lxvw4x(41, 0, %[buf2])
+#if BR_POWER8_LE
+		FIX_ENDIAN(9)
+#endif
+		addi(%[buf2], %[buf2], 16)
+		vxor(9, 28, 9)
+
+		/*
+		 * Split y into doublewords:
+		 *   v9  = y_0:y_1
+		 *   v10 =   0:y_0
+		 *   v11 = y_1:0
+		 */
+		vsldoi(10, HB0, 9, 8)
+		vsldoi(11, 9, HB0, 8)
+
+		/*
+		 * Compute products with h:
+		 *   v12 = y_0 * h_0
+		 *   v13 = y_1 * h_1
+		 *   v14 = y_1 * h_0 + y_0 * h_1
+		 */
+		vpmsumd(14,  9, 17)
+		vpmsumd(12, 10, 18)
+		vpmsumd(13, 11, 19)
+
+		/*
+		 * Propagate v14 into v12:v13 to finalise product.
+		 */
+		vsldoi(10, HB0, 14, 8)
+		vsldoi(11, 14, HB0, 8)
+		vxor(12, 12, 10)
+		vxor(13, 13, 11)
+
+		/*
+		 * Fix result and reduce into v28 (next value for y).
+		 */
+		SL_256(12, 13)
+		REDUCE_F128(28, 12, 13)
+		bdnz(loop1)
+
+	label(done)
+		/*
+		 * Write back the new y.
+		 */
+		FIX_ENDIAN(28)
+		stxvw4x(60, 0, %[y])
+
+: [buf1] "+b" (buf1), [buf2] "+b" (buf2)
+: [y] "b" (y), [h] "b" (h), [num4] "b" (num4), [num1] "b" (num1),
+  [cc0] "b" (cc0), [cc1] "b" (cc1), [cc2] "b" (cc2), [cc3] "b" (cc3)
+#if BR_POWER8_LE
+	, [idx2be] "b" (idx2be)
+#endif
+: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
+  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
+  "ctr", "memory"
+	);
+}
+
+/* see bearssl_hash.h -- variant for builds where br_ghash_pwr8 is compiled */
+br_ghash
+br_ghash_pwr8_get(void)
+{
+	return &br_ghash_pwr8;	/* unconditional: no runtime check is made here */
+}
+
+#else
+
+/* see bearssl_hash.h -- stub for builds without the POWER8 GHASH code */
+br_ghash
+br_ghash_pwr8_get(void)
+{
+	return 0;	/* null function pointer; callers compare against 0 */
+}
+
+#endif
diff --git a/src/inner.h b/src/inner.h
index 472dc2e..b03f043 100644
--- a/src/inner.h
+++ b/src/inner.h
@@ -154,6 +154,49 @@
 #define BR_TARGET(x)
 #endif
 
+/*
+ * POWER8 crypto support. We rely on compiler macros for the
+ * architecture, since we do not have a reliable, simple way to detect
+ * the required support at runtime (we could try running an opcode, and
+ * trapping the exception or signal on illegal instruction, but this
+ * induces some non-trivial OS dependencies that we would prefer to
+ * avoid if possible).
+ */
+#ifndef BR_POWER8
+#if __GNUC__ && ((_ARCH_PWR8 || _ARCH_PPC) && __CRYPTO__)
+#define BR_POWER8   1
+#endif /* __GNUC__, PowerPC architecture macro and __CRYPTO__ all set */
+#endif /* !defined BR_POWER8 (a preset value is left untouched) */
+
+/*
+ * Detect endianness on POWER8; a preset BR_POWER8_LE or _BE wins.
+ */
+#if BR_POWER8
+#if defined BR_POWER8_LE /* explicit LE setting: derive BE from it */
+#undef BR_POWER8_BE
+#if BR_POWER8_LE
+#define BR_POWER8_BE   0
+#else
+#define BR_POWER8_BE   1
+#endif
+#elif defined BR_POWER8_BE /* explicit BE setting: derive LE from it */
+#undef BR_POWER8_LE
+#if BR_POWER8_BE
+#define BR_POWER8_LE   0
+#else
+#define BR_POWER8_LE   1
+#endif
+#else /* nothing preset: follow the compiler's __LITTLE_ENDIAN__ */
+#if __LITTLE_ENDIAN__
+#define BR_POWER8_LE   1
+#define BR_POWER8_BE   0
+#else
+#define BR_POWER8_LE   0
+#define BR_POWER8_BE   1
+#endif
+#endif /* BR_POWER8_LE / BR_POWER8_BE selection */
+#endif /* BR_POWER8 */
+
 /* ==================================================================== */
 /*
  * Encoding/decoding functions.
@@ -1498,6 +1541,19 @@ unsigned br_aes_x86ni_keysched_enc(unsigned char *skni,
 unsigned br_aes_x86ni_keysched_dec(unsigned char *skni,
 	const void *key, size_t len);
 
+/*
+ * Test support for AES POWER8 opcodes (the POWER8 build returns 1).
+ */
+int br_aes_pwr8_supported(void);
+
+/*
+ * AES key schedule, using POWER8 instructions. This yields the subkeys
+ * in the encryption direction, written to skni[]; the number of rounds
+ * (10, 12 or 14) is returned. Key size MUST be 16, 24 or 32 bytes.
+ */
+unsigned br_aes_pwr8_keysched(unsigned char *skni,
+	const void *key, size_t len);
+
 /* ==================================================================== */
 /*
  * RSA.
@@ -1775,4 +1831,85 @@ int br_ssl_choose_hash(unsigned bf);
 
 /* ==================================================================== */
 
+/*
+ * PowerPC / POWER assembly stuff: each opcode macro below yields one
+ * line of assembly as a string literal. BR_POWER_ASM_MACROS must be
+ * defined before including this file, by the files using inline asm.
+ */
+
+#if BR_POWER_ASM_MACROS
+/* First level: arguments get macro-expanded, then go to the xxx_() form. */
+#define lxvw4x(xt, ra, rb)        lxvw4x_(xt, ra, rb)
+#define stxvw4x(xt, ra, rb)       stxvw4x_(xt, ra, rb)
+
+#define bdnz(foo)                 bdnz_(foo)
+#define beq(foo)                  beq_(foo)
+
+#define li(rx, value)             li_(rx, value)
+#define addi(rx, ra, imm)         addi_(rx, ra, imm)
+#define cmpldi(rx, imm)           cmpldi_(rx, imm)
+#define mtctr(rx)                 mtctr_(rx)
+#define vspltb(vrt, vrb, uim)     vspltb_(vrt, vrb, uim)
+#define vspltw(vrt, vrb, uim)     vspltw_(vrt, vrb, uim)
+#define vspltisb(vrt, imm)        vspltisb_(vrt, imm)
+#define vspltisw(vrt, imm)        vspltisw_(vrt, imm)
+#define vrlw(vrt, vra, vrb)       vrlw_(vrt, vra, vrb)
+#define vsbox(vrt, vra)           vsbox_(vrt, vra)
+#define vxor(vrt, vra, vrb)       vxor_(vrt, vra, vrb)
+#define vand(vrt, vra, vrb)       vand_(vrt, vra, vrb)
+#define vsro(vrt, vra, vrb)       vsro_(vrt, vra, vrb)
+#define vsl(vrt, vra, vrb)        vsl_(vrt, vra, vrb)
+#define vsldoi(vt, va, vb, sh)    vsldoi_(vt, va, vb, sh)
+#define vsr(vrt, vra, vrb)        vsr_(vrt, vra, vrb)
+#define vadduwm(vrt, vra, vrb)    vadduwm_(vrt, vra, vrb)
+#define vsububm(vrt, vra, vrb)    vsububm_(vrt, vra, vrb)
+#define vsubuwm(vrt, vra, vrb)    vsubuwm_(vrt, vra, vrb)
+#define vsrw(vrt, vra, vrb)       vsrw_(vrt, vra, vrb)
+#define vcipher(vt, va, vb)       vcipher_(vt, va, vb)
+#define vcipherlast(vt, va, vb)   vcipherlast_(vt, va, vb)
+#define vncipher(vt, va, vb)      vncipher_(vt, va, vb)
+#define vncipherlast(vt, va, vb)  vncipherlast_(vt, va, vb)
+#define vperm(vt, va, vb, vc)     vperm_(vt, va, vb, vc)
+#define vpmsumd(vt, va, vb)       vpmsumd_(vt, va, vb)
+#define xxpermdi(vt, va, vb, d)   xxpermdi_(vt, va, vb, d)
+/* Second level: stringify into one "\tmnemonic\toperands\n" asm line. */
+#define lxvw4x_(xt, ra, rb)       "\tlxvw4x\t" #xt "," #ra "," #rb "\n"
+#define stxvw4x_(xt, ra, rb)      "\tstxvw4x\t" #xt "," #ra "," #rb "\n"
+/* "%=" is a number unique to each asm statement (GCC): labels stay local. */
+#define label(foo)                #foo "%=:\n"
+#define bdnz_(foo)                "\tbdnz\t" #foo "%=\n"
+#define beq_(foo)                 "\tbeq\t" #foo "%=\n"
+
+#define li_(rx, value)            "\tli\t" #rx "," #value "\n"
+#define addi_(rx, ra, imm)        "\taddi\t" #rx "," #ra "," #imm "\n"
+#define cmpldi_(rx, imm)          "\tcmpldi\t" #rx "," #imm "\n"
+#define mtctr_(rx)                "\tmtctr\t" #rx "\n"
+#define vspltb_(vrt, vrb, uim)    "\tvspltb\t" #vrt "," #vrb "," #uim "\n"
+#define vspltw_(vrt, vrb, uim)    "\tvspltw\t" #vrt "," #vrb "," #uim "\n"
+#define vspltisb_(vrt, imm)       "\tvspltisb\t" #vrt "," #imm "\n"
+#define vspltisw_(vrt, imm)       "\tvspltisw\t" #vrt "," #imm "\n"
+#define vrlw_(vrt, vra, vrb)      "\tvrlw\t" #vrt "," #vra "," #vrb "\n"
+#define vsbox_(vrt, vra)          "\tvsbox\t" #vrt "," #vra "\n"
+#define vxor_(vrt, vra, vrb)      "\tvxor\t" #vrt "," #vra "," #vrb "\n"
+#define vand_(vrt, vra, vrb)      "\tvand\t" #vrt "," #vra "," #vrb "\n"
+#define vsro_(vrt, vra, vrb)      "\tvsro\t" #vrt "," #vra "," #vrb "\n"
+#define vsl_(vrt, vra, vrb)       "\tvsl\t" #vrt "," #vra "," #vrb "\n"
+#define vsldoi_(vt, va, vb, sh)   "\tvsldoi\t" #vt "," #va "," #vb "," #sh "\n"
+#define vsr_(vrt, vra, vrb)       "\tvsr\t" #vrt "," #vra "," #vrb "\n"
+#define vadduwm_(vrt, vra, vrb)   "\tvadduwm\t" #vrt "," #vra "," #vrb "\n"
+#define vsububm_(vrt, vra, vrb)   "\tvsububm\t" #vrt "," #vra "," #vrb "\n"
+#define vsubuwm_(vrt, vra, vrb)   "\tvsubuwm\t" #vrt "," #vra "," #vrb "\n"
+#define vsrw_(vrt, vra, vrb)      "\tvsrw\t" #vrt "," #vra "," #vrb "\n"
+#define vcipher_(vt, va, vb)      "\tvcipher\t" #vt "," #va "," #vb "\n"
+#define vcipherlast_(vt, va, vb)  "\tvcipherlast\t" #vt "," #va "," #vb "\n"
+#define vncipher_(vt, va, vb)     "\tvncipher\t" #vt "," #va "," #vb "\n"
+#define vncipherlast_(vt, va, vb) "\tvncipherlast\t" #vt "," #va "," #vb "\n"
+#define vperm_(vt, va, vb, vc)    "\tvperm\t" #vt "," #va "," #vb "," #vc "\n"
+#define vpmsumd_(vt, va, vb)      "\tvpmsumd\t" #vt "," #va "," #vb "\n"
+#define xxpermdi_(vt, va, vb, d)  "\txxpermdi\t" #vt "," #va "," #vb "," #d "\n"
+
+#endif /* BR_POWER_ASM_MACROS */
+
+/* ==================================================================== */
+
 #endif
diff --git a/src/ssl/ssl_engine_default_aescbc.c b/src/ssl/ssl_engine_default_aescbc.c
index 556d6eb..8c5cdb5 100644
--- a/src/ssl/ssl_engine_default_aescbc.c
+++ b/src/ssl/ssl_engine_default_aescbc.c
@@ -28,7 +28,7 @@
 void
 br_ssl_engine_set_default_aes_cbc(br_ssl_engine_context *cc)
 {
-#if BR_AES_X86NI
+#if BR_AES_X86NI || BR_POWER8
 	const br_block_cbcenc_class *ienc;
 	const br_block_cbcdec_class *idec;
 #endif
@@ -44,6 +44,14 @@ br_ssl_engine_set_default_aes_cbc(br_ssl_engine_context *cc)
 		return;
 	}
 #endif
+#if BR_POWER8
+	ienc = br_aes_pwr8_cbcenc_get_vtable();
+	idec = br_aes_pwr8_cbcdec_get_vtable();
+	if (ienc != NULL && idec != NULL) {
+		br_ssl_engine_set_aes_cbc(cc, ienc, idec);
+		return;
+	}
+#endif
 #if BR_64
 	br_ssl_engine_set_aes_cbc(cc,
 		&br_aes_ct64_cbcenc_vtable,
diff --git a/src/ssl/ssl_engine_default_aesgcm.c b/src/ssl/ssl_engine_default_aesgcm.c
index 9968342..c44a707 100644
--- a/src/ssl/ssl_engine_default_aesgcm.c
+++ b/src/ssl/ssl_engine_default_aesgcm.c
@@ -28,7 +28,7 @@
 void
 br_ssl_engine_set_default_aes_gcm(br_ssl_engine_context *cc)
 {
-#if BR_AES_X86NI
+#if BR_AES_X86NI || BR_POWER8
 	const br_block_ctr_class *ictr;
 	br_ghash ighash;
 #endif
@@ -47,6 +47,17 @@ br_ssl_engine_set_default_aes_gcm(br_ssl_engine_context *cc)
 		br_ssl_engine_set_aes_ctr(cc, &br_aes_ct_ctr_vtable);
 #endif
 	}
+#elif BR_POWER8
+	ictr = br_aes_pwr8_ctr_get_vtable();
+	if (ictr != NULL) {
+		br_ssl_engine_set_aes_ctr(cc, ictr);
+	} else {
+#if BR_64
+		br_ssl_engine_set_aes_ctr(cc, &br_aes_ct64_ctr_vtable);
+#else
+		br_ssl_engine_set_aes_ctr(cc, &br_aes_ct_ctr_vtable);
+#endif
+	}
 #else
 #if BR_64
 	br_ssl_engine_set_aes_ctr(cc, &br_aes_ct64_ctr_vtable);
@@ -61,6 +72,13 @@ br_ssl_engine_set_default_aes_gcm(br_ssl_engine_context *cc)
 		return;
 	}
 #endif
+#if BR_POWER8
+	ighash = br_ghash_pwr8_get();
+	if (ighash != 0) {
+		br_ssl_engine_set_ghash(cc, ighash);
+		return;
+	}
+#endif
 #if BR_LOMUL
 	br_ssl_engine_set_ghash(cc, &br_ghash_ctmul32);
 #elif BR_64
diff --git a/src/symcipher/aes_pwr8.c b/src/symcipher/aes_pwr8.c
new file mode 100644
index 0000000..b2c63c3
--- /dev/null
+++ b/src/symcipher/aes_pwr8.c
@@ -0,0 +1,445 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define BR_POWER_ASM_MACROS   1
+#include "inner.h"
+
+/*
+ * This code contains the AES key schedule implementation using the
+ * POWER8 opcodes.
+ */
+
+#if BR_POWER8
+
+static void
+key_schedule_128(unsigned char *sk, const unsigned char *key)
+{
+	long cc;	/* scratch GPR, only used to load the loop count */
+
+	static const uint32_t fmod[] = { 0x11B, 0x11B, 0x11B, 0x11B };
+#if BR_POWER8_LE
+	static const uint32_t idx2be[] = {
+		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
+	};
+#endif
+
+	cc = 0;
+
+	/*
+	 * Expand the 16-byte key into 11 subkeys (176 bytes) at sk[]. VSX
+	 * instructions do the loads and stores, since they support
+	 * unaligned accesses; the rest of the computation is VMX only.
+	 * VMX register vN is VSX register 32+N (so 34 is v2, 40 is v8).
+	 */
+	asm volatile (
+
+		/*
+		 * v0 = all-zero word (v4 and v5 are scratch)
+		 * v1 = constant -8 / +8, copied into four words
+		 * v2 = current subkey
+		 * v3 = Rcon (x4 words)
+		 * v6 = constant 8, copied into four words
+		 * v7 = constant 0x11B, copied into four words
+		 * v8 = constant for byteswapping words (little-endian only)
+		 */
+		vspltisw(0, 0)
+#if BR_POWER8_LE
+		vspltisw(1, -8)
+#else
+		vspltisw(1, 8)
+#endif
+		lxvw4x(34, 0, %[key])
+		vspltisw(3, 1)
+		vspltisw(6, 8)
+		lxvw4x(39, 0, %[fmod])
+#if BR_POWER8_LE
+		lxvw4x(40, 0, %[idx2be])
+#endif
+
+		/*
+		 * First subkey is a copy of the key itself.
+		 */
+#if BR_POWER8_LE
+		vperm(4, 2, 2, 8)
+		stxvw4x(36, 0, %[sk])
+#else
+		stxvw4x(34, 0, %[sk])
+#endif
+
+		/*
+		 * Loop must run 10 times (one iteration per remaining subkey).
+		 */
+		li(%[cc], 10)
+		mtctr(%[cc])
+	label(loop)
+		/* Increment subkey address */
+		addi(%[sk], %[sk], 16)
+
+		/* Compute SubWord(RotWord(temp)) xor Rcon  (into v4, splat) */
+		vrlw(4, 2, 1)
+		vsbox(4, 4)
+#if BR_POWER8_LE
+		vxor(4, 4, 3)
+#else
+		vsldoi(5, 3, 0, 3)	/* Rcon moved to the top byte of each word */
+		vxor(4, 4, 5)
+#endif
+		vspltw(4, 4, 3)		/* keep the value derived from the last word */
+
+		/* XOR words for next subkey (running XOR over the four words) */
+		vsldoi(5, 0, 2, 12)	/* v5 = v2 moved right by one word */
+		vxor(2, 2, 5)
+		vsldoi(5, 0, 2, 12)
+		vxor(2, 2, 5)
+		vsldoi(5, 0, 2, 12)
+		vxor(2, 2, 5)
+		vxor(2, 2, 4)
+
+		/* Store next subkey */
+#if BR_POWER8_LE
+		vperm(4, 2, 2, 8)
+		stxvw4x(36, 0, %[sk])
+#else
+		stxvw4x(34, 0, %[sk])
+#endif
+
+		/* Update Rcon: double it, XOR 0x11B in if it went past 8 bits */
+		vadduwm(3, 3, 3)	/* Rcon + Rcon */
+		vsrw(4, 3, 6)		/* v4 = Rcon >> 8 */
+		vsubuwm(4, 0, 4)	/* v4 = 0 - v4: all-ones mask or zero */
+		vand(4, 4, 7)		/* mask & 0x11B */
+		vxor(3, 3, 4)
+
+		bdnz(loop)
+
+: [sk] "+b" (sk), [cc] "+b" (cc)
+: [key] "b" (key), [fmod] "b" (fmod)
+#if BR_POWER8_LE
+	, [idx2be] "b" (idx2be)
+#endif
+: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "ctr", "memory"
+	);
+}
+
+static void
+key_schedule_192(unsigned char *sk, const unsigned char *key)
+{
+	long cc;	/* scratch GPR: load offset, loop count, store offset */
+
+#if BR_POWER8_LE
+	static const uint32_t idx2be[] = {
+		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
+	};
+#endif
+
+	cc = 0;
+
+	/*
+	 * Expand the 24-byte key into 52 subkey words (208 bytes) at
+	 * sk[]. VSX instructions do the loads and stores, since they
+	 * support unaligned accesses; the rest of the computation is VMX
+	 * only. VMX register vN is VSX register 32+N.
+	 */
+	asm volatile (
+
+		/*
+		 * v0 = all-zero word
+		 * v1 = constant -8 / +8, copied into four words
+		 * v2, v3 = last six subkey words (v2, then left half of v3)
+		 * v5 = Rcon (x4 words) (already shifted on big-endian)
+		 * v6 = constant 8, copied into four words (not used below)
+		 * v8 = constant for byteswapping words (little-endian only)
+		 *
+		 * At loop entry the right two words of v3 hold no key words.
+		 */
+		vspltisw(0, 0)
+#if BR_POWER8_LE
+		vspltisw(1, -8)
+#else
+		vspltisw(1, 8)
+#endif
+		li(%[cc], 8)
+		lxvw4x(34, 0, %[key])
+		lxvw4x(35, %[cc], %[key])	/* key bytes 8..23 */
+		vsldoi(3, 3, 0, 8)		/* keep bytes 16..23, zero the rest */
+		vspltisw(5, 1)
+#if !BR_POWER8_LE
+		vsldoi(5, 5, 0, 3)
+#endif
+		vspltisw(6, 8)
+#if BR_POWER8_LE
+		lxvw4x(40, 0, %[idx2be])
+#endif
+
+		/*
+		 * Loop must run 8 times. Each iteration produces 256
+		 * bits of subkeys, with a 64-bit overlap.
+		 */
+		li(%[cc], 8)
+		mtctr(%[cc])
+		li(%[cc], 16)		/* from now on cc = offset of 2nd store */
+	label(loop)
+
+		/*
+		 * Last 6 words in v2:v3l. Compute next 6 words into
+		 * v3r:v4.
+		 */
+		vrlw(10, 3, 1)
+		vsbox(10, 10)
+		vxor(10, 10, 5)
+		vspltw(10, 10, 1)	/* value derived from the sixth word */
+		vsldoi(11, 0, 10, 8)	/* same value, right two words only */
+
+		vsldoi(12, 0, 2, 12)	/* running XOR over the words of v2 */
+		vxor(12, 2, 12)
+		vsldoi(13, 0, 12, 12)
+		vxor(12, 12, 13)
+		vsldoi(13, 0, 12, 12)
+		vxor(12, 12, 13)
+
+		vspltw(13, 12, 3)	/* carry the running XOR into v3l */
+		vxor(13, 13, 3)
+		vsldoi(14, 0, 3, 12)
+		vxor(13, 13, 14)
+
+		vsldoi(4, 12, 13, 8)	/* v4 = right half of v12 : left of v13 */
+		vsldoi(14, 0, 3, 8)
+		vsldoi(3, 14, 12, 8)	/* v3 = old v3l : left half of v12 */
+
+		vxor(3, 3, 11)
+		vxor(4, 4, 10)
+
+		/*
+		 * Update Rcon. Since for a 192-bit key, we use only 8
+		 * such constants, we will not hit the field modulus,
+		 * so a simple shift (addition) works well.
+		 */
+		vadduwm(5, 5, 5)
+
+		/*
+		 * Write out the two left 128-bit words
+		 */
+#if BR_POWER8_LE
+		vperm(10, 2, 2, 8)
+		vperm(11, 3, 3, 8)
+		stxvw4x(42, 0, %[sk])
+		stxvw4x(43, %[cc], %[sk])
+#else
+		stxvw4x(34, 0, %[sk])
+		stxvw4x(35, %[cc], %[sk])
+#endif
+		addi(%[sk], %[sk], 24)	/* 32 bytes stored, 24 kept: 8 overlap */
+
+		/*
+		 * Shift words for next iteration.
+		 */
+		vsldoi(2, 3, 4, 8)	/* v2 = v3r : v4l */
+		vsldoi(3, 4, 0, 8)	/* v3 = v4r : zero */
+
+		bdnz(loop)
+
+		/*
+		 * The loop wrote the first 50 subkey words, but we need
+		 * to produce 52, so we must do one last write.
+		 */
+#if BR_POWER8_LE
+		vperm(10, 2, 2, 8)
+		stxvw4x(42, 0, %[sk])
+#else
+		stxvw4x(34, 0, %[sk])
+#endif
+
+: [sk] "+b" (sk), [cc] "+b" (cc)
+: [key] "b" (key)
+#if BR_POWER8_LE
+	, [idx2be] "b" (idx2be)
+#endif
+: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+  "v8", "v9", "v10", "v11", "v12", "v13", "v14", "ctr", "memory"
+	);
+}
+
+static void
+key_schedule_256(unsigned char *sk, const unsigned char *key)
+{
+	long cc;	/* scratch GPR: load offset, loop count, store offset */
+
+#if BR_POWER8_LE
+	static const uint32_t idx2be[] = {
+		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
+	};
+#endif
+
+	cc = 0;
+
+	/*
+	 * Expand the 32-byte key into 15 subkeys (240 bytes) at sk[]. VSX
+	 * instructions do the loads and stores, since they support
+	 * unaligned accesses; the rest of the computation is VMX only.
+	 * VMX register vN is VSX register 32+N.
+	 */
+	asm volatile (
+
+		/*
+		 * v0 = all-zero word
+		 * v1 = constant -8 / +8, copied into four words
+		 * v2, v3 = current subkey pair (last eight subkey words)
+		 * v6 = Rcon (x4 words) (already shifted on big-endian)
+		 * v7 = constant 8, copied into four words (not used below)
+		 * v8 = constant for byteswapping words (little-endian only)
+		 *
+		 * Both v2 and v3 are fully used: they start as the raw key.
+		 */
+		vspltisw(0, 0)
+#if BR_POWER8_LE
+		vspltisw(1, -8)
+#else
+		vspltisw(1, 8)
+#endif
+		li(%[cc], 16)
+		lxvw4x(34, 0, %[key])
+		lxvw4x(35, %[cc], %[key])	/* key bytes 16..31 */
+		vspltisw(6, 1)
+#if !BR_POWER8_LE
+		vsldoi(6, 6, 0, 3)
+#endif
+		vspltisw(7, 8)
+#if BR_POWER8_LE
+		lxvw4x(40, 0, %[idx2be])
+#endif
+
+		/*
+		 * Loop must run 7 times. Each iteration produces two
+		 * subkeys.
+		 */
+		li(%[cc], 7)
+		mtctr(%[cc])
+		li(%[cc], 16)		/* from now on cc = offset of 2nd store */
+	label(loop)
+
+		/*
+		 * Current words are in v2:v3. Compute next four words in v4.
+		 */
+		vrlw(10, 3, 1)
+		vsbox(10, 10)
+		vxor(10, 10, 6)
+		vspltw(10, 10, 3)	/* value derived from the last word of v3 */
+
+		vsldoi(4, 0, 2, 12)	/* running XOR over the words of v2 */
+		vxor(4, 2, 4)
+		vsldoi(5, 0, 4, 12)
+		vxor(4, 4, 5)
+		vsldoi(5, 0, 4, 12)
+		vxor(4, 4, 5)
+		vxor(4, 4, 10)
+
+		/*
+		 * Then the following four words in v5 (no rotation, no Rcon).
+		 */
+		vsbox(10, 4)
+		vspltw(10, 10, 3)	/* value derived from the last word of v4 */
+
+		vsldoi(5, 0, 3, 12)	/* running XOR over the words of v3 */
+		vxor(5, 3, 5)
+		vsldoi(11, 0, 5, 12)
+		vxor(5, 5, 11)
+		vsldoi(11, 0, 5, 12)
+		vxor(5, 5, 11)
+		vxor(5, 5, 10)
+
+		/*
+		 * Update Rcon. Since for a 256-bit key, we use only 7
+		 * such constants, we will not hit the field modulus,
+		 * so a simple shift (addition) works well.
+		 */
+		vadduwm(6, 6, 6)
+
+		/*
+		 * Write out the two left 128-bit words
+		 */
+#if BR_POWER8_LE
+		vperm(10, 2, 2, 8)
+		vperm(11, 3, 3, 8)
+		stxvw4x(42, 0, %[sk])
+		stxvw4x(43, %[cc], %[sk])
+#else
+		stxvw4x(34, 0, %[sk])
+		stxvw4x(35, %[cc], %[sk])
+#endif
+		addi(%[sk], %[sk], 32)
+
+		/*
+		 * Replace v2:v3 with v4:v5 (XOR with zero is a plain copy).
+		 */
+		vxor(2, 0, 4)
+		vxor(3, 0, 5)
+
+		bdnz(loop)
+
+		/*
+		 * The loop wrote the first 14 subkeys, but we need 15,
+		 * so we must do an extra write.
+		 */
+#if BR_POWER8_LE
+		vperm(10, 2, 2, 8)
+		stxvw4x(42, 0, %[sk])
+#else
+		stxvw4x(34, 0, %[sk])
+#endif
+
+: [sk] "+b" (sk), [cc] "+b" (cc)
+: [key] "b" (key)
+#if BR_POWER8_LE
+	, [idx2be] "b" (idx2be)
+#endif
+: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+  "v8", "v9", "v10", "v11", "v12", "v13", "v14", "ctr", "memory"
+	);
+}
+
+/* see inner.h -- this definition only exists inside the BR_POWER8 build */
+int
+br_aes_pwr8_supported(void)
+{
+	return 1;	/* compile-time decision; no runtime probing is done */
+}
+
+/* see inner.h; an unsupported key size yields 0 and leaves sk[] untouched */
+unsigned
+br_aes_pwr8_keysched(unsigned char *sk, const void *key, size_t len)
+{
+	if (len == 16) {
+		key_schedule_128(sk, key);
+		return 10;
+	} else if (len == 24) {
+		key_schedule_192(sk, key);
+		return 12;
+	} else if (len == 32) {
+		key_schedule_256(sk, key);
+		return 14;
+	}
+	return 0;	/* never read 32 bytes out of a key of another size */
+}
+
+#endif
diff --git a/src/symcipher/aes_pwr8_cbcdec.c b/src/symcipher/aes_pwr8_cbcdec.c
new file mode 100644
index 0000000..e535ba6
--- /dev/null
+++ b/src/symcipher/aes_pwr8_cbcdec.c
@@ -0,0 +1,670 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define BR_POWER_ASM_MACROS   1
+#include "inner.h"
+
+#if BR_POWER8
+
+/* see bearssl_block.h -- binds the vtable and expands the key into ctx */
+void
+br_aes_pwr8_cbcdec_init(br_aes_pwr8_cbcdec_keys *ctx,
+	const void *key, size_t len)
+{
+	ctx->vtable = &br_aes_pwr8_cbcdec_vtable;
+	ctx->num_rounds = br_aes_pwr8_keysched(ctx->skey.skni, key, len);
+}
+
+static void
+cbcdec_128(const unsigned char *sk,
+	const unsigned char *iv, unsigned char *buf, size_t num_blocks)
+{
+	long cc0, cc1, cc2, cc3;
+	/* cc0..cc3: byte offsets of the four blocks in each 64-byte group. */
+#if BR_POWER8_LE
+	static const uint32_t idx2be[] = {
+		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
+	};
+#endif
+
+	cc0 = 0;
+	cc1 = 16;
+	cc2 = 32;
+	cc3 = 48;
+	asm volatile (
+
+		/*
+		 * Load the 11 subkeys into v0..v10 (cc0 steps 0, 16, .. 160).
+		 */
+		lxvw4x(32, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(33, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(34, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(35, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(36, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(37, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(38, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(39, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(40, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(41, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(42, %[cc0], %[sk])
+		li(%[cc0], 0)		/* cc0 back to 0 for the data accesses */
+
+#if BR_POWER8_LE
+		/*
+		 * v15 = constant for byteswapping words
+		 */
+		lxvw4x(47, 0, %[idx2be])
+#endif
+		/*
+		 * Load IV into v24.
+		 */
+		lxvw4x(56, 0, %[iv])
+#if BR_POWER8_LE
+		vperm(24, 24, 24, 15)
+#endif
+		/* NOTE(review): count is num_blocks >> 2; presumably never 0. */
+		mtctr(%[num_blocks])
+	label(loop)
+		/*
+		 * Load next ciphertext words in v16..v19. Also save them
+		 * in v20..v23.
+		 */
+		lxvw4x(48, %[cc0], %[buf])
+		lxvw4x(49, %[cc1], %[buf])
+		lxvw4x(50, %[cc2], %[buf])
+		lxvw4x(51, %[cc3], %[buf])
+#if BR_POWER8_LE
+		vperm(16, 16, 16, 15)
+		vperm(17, 17, 17, 15)
+		vperm(18, 18, 18, 15)
+		vperm(19, 19, 19, 15)
+#endif
+		vand(20, 16, 16)	/* x AND x: plain register copy */
+		vand(21, 17, 17)
+		vand(22, 18, 18)
+		vand(23, 19, 19)
+
+		/*
+		 * Decrypt the blocks (subkeys applied from v10 down to v0).
+		 */
+		vxor(16, 16, 10)
+		vxor(17, 17, 10)
+		vxor(18, 18, 10)
+		vxor(19, 19, 10)
+		vncipher(16, 16, 9)
+		vncipher(17, 17, 9)
+		vncipher(18, 18, 9)
+		vncipher(19, 19, 9)
+		vncipher(16, 16, 8)
+		vncipher(17, 17, 8)
+		vncipher(18, 18, 8)
+		vncipher(19, 19, 8)
+		vncipher(16, 16, 7)
+		vncipher(17, 17, 7)
+		vncipher(18, 18, 7)
+		vncipher(19, 19, 7)
+		vncipher(16, 16, 6)
+		vncipher(17, 17, 6)
+		vncipher(18, 18, 6)
+		vncipher(19, 19, 6)
+		vncipher(16, 16, 5)
+		vncipher(17, 17, 5)
+		vncipher(18, 18, 5)
+		vncipher(19, 19, 5)
+		vncipher(16, 16, 4)
+		vncipher(17, 17, 4)
+		vncipher(18, 18, 4)
+		vncipher(19, 19, 4)
+		vncipher(16, 16, 3)
+		vncipher(17, 17, 3)
+		vncipher(18, 18, 3)
+		vncipher(19, 19, 3)
+		vncipher(16, 16, 2)
+		vncipher(17, 17, 2)
+		vncipher(18, 18, 2)
+		vncipher(19, 19, 2)
+		vncipher(16, 16, 1)
+		vncipher(17, 17, 1)
+		vncipher(18, 18, 1)
+		vncipher(19, 19, 1)
+		vncipherlast(16, 16, 0)
+		vncipherlast(17, 17, 0)
+		vncipherlast(18, 18, 0)
+		vncipherlast(19, 19, 0)
+
+		/*
+		 * XOR decrypted blocks with IV / previous block.
+		 */
+		vxor(16, 16, 24)	/* first block: IV (or carried-over block) */
+		vxor(17, 17, 20)	/* others: saved previous ciphertext block */
+		vxor(18, 18, 21)
+		vxor(19, 19, 22)
+
+		/*
+		 * Store back result (with byteswap)
+		 */
+#if BR_POWER8_LE
+		vperm(16, 16, 16, 15)
+		vperm(17, 17, 17, 15)
+		vperm(18, 18, 18, 15)
+		vperm(19, 19, 19, 15)
+#endif
+		stxvw4x(48, %[cc0], %[buf])
+		stxvw4x(49, %[cc1], %[buf])
+		stxvw4x(50, %[cc2], %[buf])
+		stxvw4x(51, %[cc3], %[buf])
+
+		/*
+		 * Fourth encrypted block is IV for next run.
+		 */
+		vand(24, 23, 23)
+
+		addi(%[buf], %[buf], 64)
+
+		bdnz(loop)
+
+: [cc0] "+b" (cc0), [cc1] "+b" (cc1), [cc2] "+b" (cc2), [cc3] "+b" (cc3),
+  [buf] "+b" (buf)
+: [sk] "b" (sk), [iv] "b" (iv), [num_blocks] "b" (num_blocks >> 2)
+#if BR_POWER8_LE
+	, [idx2be] "b" (idx2be)
+#endif
+: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
+  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
+  "ctr", "memory"
+	);
+}
+
+static void
+cbcdec_192(const unsigned char *sk,
+	const unsigned char *iv, unsigned char *buf, size_t num_blocks)
+{
+	long cc0, cc1, cc2, cc3;
+	/* cc0..cc3: byte offsets of the four blocks in each 64-byte group. */
+#if BR_POWER8_LE
+	static const uint32_t idx2be[] = {
+		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
+	};
+#endif
+
+	cc0 = 0;
+	cc1 = 16;
+	cc2 = 32;
+	cc3 = 48;
+	asm volatile (
+
+		/*
+		 * Load the 13 subkeys into v0..v12 (cc0 steps 0, 16, .. 192).
+		 */
+		lxvw4x(32, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(33, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(34, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(35, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(36, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(37, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(38, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(39, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(40, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(41, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(42, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(43, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(44, %[cc0], %[sk])
+		li(%[cc0], 0)		/* cc0 back to 0 for the data accesses */
+
+#if BR_POWER8_LE
+		/*
+		 * v15 = constant for byteswapping words
+		 */
+		lxvw4x(47, 0, %[idx2be])
+#endif
+		/*
+		 * Load IV into v24.
+		 */
+		lxvw4x(56, 0, %[iv])
+#if BR_POWER8_LE
+		vperm(24, 24, 24, 15)
+#endif
+		/* NOTE(review): count is num_blocks >> 2; presumably never 0. */
+		mtctr(%[num_blocks])
+	label(loop)
+		/*
+		 * Load next ciphertext words in v16..v19. Also save them
+		 * in v20..v23.
+		 */
+		lxvw4x(48, %[cc0], %[buf])
+		lxvw4x(49, %[cc1], %[buf])
+		lxvw4x(50, %[cc2], %[buf])
+		lxvw4x(51, %[cc3], %[buf])
+#if BR_POWER8_LE
+		vperm(16, 16, 16, 15)
+		vperm(17, 17, 17, 15)
+		vperm(18, 18, 18, 15)
+		vperm(19, 19, 19, 15)
+#endif
+		vand(20, 16, 16)	/* x AND x: plain register copy */
+		vand(21, 17, 17)
+		vand(22, 18, 18)
+		vand(23, 19, 19)
+
+		/*
+		 * Decrypt the blocks (subkeys applied from v12 down to v0).
+		 */
+		vxor(16, 16, 12)
+		vxor(17, 17, 12)
+		vxor(18, 18, 12)
+		vxor(19, 19, 12)
+		vncipher(16, 16, 11)
+		vncipher(17, 17, 11)
+		vncipher(18, 18, 11)
+		vncipher(19, 19, 11)
+		vncipher(16, 16, 10)
+		vncipher(17, 17, 10)
+		vncipher(18, 18, 10)
+		vncipher(19, 19, 10)
+		vncipher(16, 16, 9)
+		vncipher(17, 17, 9)
+		vncipher(18, 18, 9)
+		vncipher(19, 19, 9)
+		vncipher(16, 16, 8)
+		vncipher(17, 17, 8)
+		vncipher(18, 18, 8)
+		vncipher(19, 19, 8)
+		vncipher(16, 16, 7)
+		vncipher(17, 17, 7)
+		vncipher(18, 18, 7)
+		vncipher(19, 19, 7)
+		vncipher(16, 16, 6)
+		vncipher(17, 17, 6)
+		vncipher(18, 18, 6)
+		vncipher(19, 19, 6)
+		vncipher(16, 16, 5)
+		vncipher(17, 17, 5)
+		vncipher(18, 18, 5)
+		vncipher(19, 19, 5)
+		vncipher(16, 16, 4)
+		vncipher(17, 17, 4)
+		vncipher(18, 18, 4)
+		vncipher(19, 19, 4)
+		vncipher(16, 16, 3)
+		vncipher(17, 17, 3)
+		vncipher(18, 18, 3)
+		vncipher(19, 19, 3)
+		vncipher(16, 16, 2)
+		vncipher(17, 17, 2)
+		vncipher(18, 18, 2)
+		vncipher(19, 19, 2)
+		vncipher(16, 16, 1)
+		vncipher(17, 17, 1)
+		vncipher(18, 18, 1)
+		vncipher(19, 19, 1)
+		vncipherlast(16, 16, 0)
+		vncipherlast(17, 17, 0)
+		vncipherlast(18, 18, 0)
+		vncipherlast(19, 19, 0)
+
+		/*
+		 * XOR decrypted blocks with IV / previous block.
+		 */
+		vxor(16, 16, 24)	/* first block: IV (or carried-over block) */
+		vxor(17, 17, 20)	/* others: saved previous ciphertext block */
+		vxor(18, 18, 21)
+		vxor(19, 19, 22)
+
+		/*
+		 * Store back result (with byteswap)
+		 */
+#if BR_POWER8_LE
+		vperm(16, 16, 16, 15)
+		vperm(17, 17, 17, 15)
+		vperm(18, 18, 18, 15)
+		vperm(19, 19, 19, 15)
+#endif
+		stxvw4x(48, %[cc0], %[buf])
+		stxvw4x(49, %[cc1], %[buf])
+		stxvw4x(50, %[cc2], %[buf])
+		stxvw4x(51, %[cc3], %[buf])
+
+		/*
+		 * Fourth encrypted block is IV for next run.
+		 */
+		vand(24, 23, 23)
+
+		addi(%[buf], %[buf], 64)
+
+		bdnz(loop)
+
+: [cc0] "+b" (cc0), [cc1] "+b" (cc1), [cc2] "+b" (cc2), [cc3] "+b" (cc3),
+  [buf] "+b" (buf)
+: [sk] "b" (sk), [iv] "b" (iv), [num_blocks] "b" (num_blocks >> 2)
+#if BR_POWER8_LE
+	, [idx2be] "b" (idx2be)
+#endif
+: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
+  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
+  "ctr", "memory"
+	);
+}
+
+static void
+cbcdec_256(const unsigned char *sk,
+	const unsigned char *iv, unsigned char *buf, size_t num_blocks)
+{
+	long cc0, cc1, cc2, cc3;
+	/* AES-256 CBC decryption in place, four blocks per loop iteration. */
+#if BR_POWER8_LE
+	static const uint32_t idx2be[] = {
+		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
+	};
+#endif
+
+	cc0 = 0;
+	cc1 = 16;
+	cc2 = 32;
+	cc3 = 48;
+	asm volatile (
+
+		/*
+		 * Load the 15 subkeys into v0..v14, then reset cc0 to 0.
+		 */
+		lxvw4x(32, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(33, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(34, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(35, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(36, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(37, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(38, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(39, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(40, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(41, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(42, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(43, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(44, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(45, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(46, %[cc0], %[sk])
+		li(%[cc0], 0)
+
+#if BR_POWER8_LE
+		/*
+		 * v15 = vperm index vector that byteswaps each 32-bit word
+		 */
+		lxvw4x(47, 0, %[idx2be])
+#endif
+		/*
+		 * Load IV into v24 (byteswapped on little-endian).
+		 */
+		lxvw4x(56, 0, %[iv])
+#if BR_POWER8_LE
+		vperm(24, 24, 24, 15)
+#endif
+
+		mtctr(%[num_blocks])
+	label(loop)
+		/*
+		 * Load the next four ciphertext blocks into v16..v19. Also
+		 * keep copies in v20..v23, needed for the CBC chaining XOR.
+		 */
+		lxvw4x(48, %[cc0], %[buf])
+		lxvw4x(49, %[cc1], %[buf])
+		lxvw4x(50, %[cc2], %[buf])
+		lxvw4x(51, %[cc3], %[buf])
+#if BR_POWER8_LE
+		vperm(16, 16, 16, 15)
+		vperm(17, 17, 17, 15)
+		vperm(18, 18, 18, 15)
+		vperm(19, 19, 19, 15)
+#endif
+		vand(20, 16, 16)
+		vand(21, 17, 17)
+		vand(22, 18, 18)
+		vand(23, 19, 19)
+
+		/*
+		 * Decrypt the blocks: subkeys are applied from v14 down to v0.
+		 */
+		vxor(16, 16, 14)
+		vxor(17, 17, 14)
+		vxor(18, 18, 14)
+		vxor(19, 19, 14)
+		vncipher(16, 16, 13)
+		vncipher(17, 17, 13)
+		vncipher(18, 18, 13)
+		vncipher(19, 19, 13)
+		vncipher(16, 16, 12)
+		vncipher(17, 17, 12)
+		vncipher(18, 18, 12)
+		vncipher(19, 19, 12)
+		vncipher(16, 16, 11)
+		vncipher(17, 17, 11)
+		vncipher(18, 18, 11)
+		vncipher(19, 19, 11)
+		vncipher(16, 16, 10)
+		vncipher(17, 17, 10)
+		vncipher(18, 18, 10)
+		vncipher(19, 19, 10)
+		vncipher(16, 16, 9)
+		vncipher(17, 17, 9)
+		vncipher(18, 18, 9)
+		vncipher(19, 19, 9)
+		vncipher(16, 16, 8)
+		vncipher(17, 17, 8)
+		vncipher(18, 18, 8)
+		vncipher(19, 19, 8)
+		vncipher(16, 16, 7)
+		vncipher(17, 17, 7)
+		vncipher(18, 18, 7)
+		vncipher(19, 19, 7)
+		vncipher(16, 16, 6)
+		vncipher(17, 17, 6)
+		vncipher(18, 18, 6)
+		vncipher(19, 19, 6)
+		vncipher(16, 16, 5)
+		vncipher(17, 17, 5)
+		vncipher(18, 18, 5)
+		vncipher(19, 19, 5)
+		vncipher(16, 16, 4)
+		vncipher(17, 17, 4)
+		vncipher(18, 18, 4)
+		vncipher(19, 19, 4)
+		vncipher(16, 16, 3)
+		vncipher(17, 17, 3)
+		vncipher(18, 18, 3)
+		vncipher(19, 19, 3)
+		vncipher(16, 16, 2)
+		vncipher(17, 17, 2)
+		vncipher(18, 18, 2)
+		vncipher(19, 19, 2)
+		vncipher(16, 16, 1)
+		vncipher(17, 17, 1)
+		vncipher(18, 18, 1)
+		vncipher(19, 19, 1)
+		vncipherlast(16, 16, 0)
+		vncipherlast(17, 17, 0)
+		vncipherlast(18, 18, 0)
+		vncipherlast(19, 19, 0)
+
+		/*
+		 * XOR decrypted blocks with IV / previous ciphertext block.
+		 */
+		vxor(16, 16, 24)
+		vxor(17, 17, 20)
+		vxor(18, 18, 21)
+		vxor(19, 19, 22)
+
+		/*
+		 * Store back result (with byteswap on little-endian).
+		 */
+#if BR_POWER8_LE
+		vperm(16, 16, 16, 15)
+		vperm(17, 17, 17, 15)
+		vperm(18, 18, 18, 15)
+		vperm(19, 19, 19, 15)
+#endif
+		stxvw4x(48, %[cc0], %[buf])
+		stxvw4x(49, %[cc1], %[buf])
+		stxvw4x(50, %[cc2], %[buf])
+		stxvw4x(51, %[cc3], %[buf])
+
+		/*
+		 * Fourth ciphertext block (v23) is the IV for next iteration.
+		 */
+		vand(24, 23, 23)
+
+		addi(%[buf], %[buf], 64)
+
+		bdnz(loop)
+
+: [cc0] "+b" (cc0), [cc1] "+b" (cc1), [cc2] "+b" (cc2), [cc3] "+b" (cc3),
+  [buf] "+b" (buf)
+: [sk] "b" (sk), [iv] "b" (iv), [num_blocks] "b" (num_blocks >> 2)
+#if BR_POWER8_LE
+	, [idx2be] "b" (idx2be)
+#endif
+: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
+  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
+  "ctr", "memory"
+	);
+}
+
+/* see bearssl_block.h */
+void
+br_aes_pwr8_cbcdec_run(const br_aes_pwr8_cbcdec_keys *ctx,
+	void *iv, void *data, size_t len)
+{
+	unsigned char *p;
+	unsigned char saved_iv[16];
+	size_t bulk;
+
+	if (len == 0) {
+		return;
+	}
+	p = data;
+	/*
+	 * Decryption is done in place, so the last ciphertext block (the
+	 * IV for the next call) is saved before it gets overwritten.
+	 */
+	memcpy(saved_iv, p + len - 16, 16);
+
+	/* Bulk part: whole 64-byte chunks (four blocks each). */
+	bulk = len & ~(size_t)63;
+	if (bulk != 0) {
+		unsigned char chain[16];
+
+		memcpy(chain, p + bulk - 16, 16);
+		if (ctx->num_rounds == 10) {
+			cbcdec_128(ctx->skey.skni, iv, p, bulk >> 4);
+		} else if (ctx->num_rounds == 12) {
+			cbcdec_192(ctx->skey.skni, iv, p, bulk >> 4);
+		} else {
+			cbcdec_256(ctx->skey.skni, iv, p, bulk >> 4);
+		}
+		p += bulk;
+		len -= bulk;
+		memcpy(iv, chain, 16);
+	}
+
+	/* Tail (fewer than 64 bytes): uses a zero-padded chunk. */
+	if (len != 0) {
+		unsigned char pad[64];
+
+		memcpy(pad, p, len);
+		memset(pad + len, 0, (sizeof pad) - len);
+		if (ctx->num_rounds == 10) {
+			cbcdec_128(ctx->skey.skni, iv, pad, 4);
+		} else if (ctx->num_rounds == 12) {
+			cbcdec_192(ctx->skey.skni, iv, pad, 4);
+		} else {
+			cbcdec_256(ctx->skey.skni, iv, pad, 4);
+		}
+		memcpy(p, pad, len);
+	}
+	memcpy(iv, saved_iv, 16);
+}
+
+/* see bearssl_block.h (which declares the fields of this descriptor) */
+const br_block_cbcdec_class br_aes_pwr8_cbcdec_vtable = {
+	sizeof(br_aes_pwr8_cbcdec_keys),
+	16,	/* presumably the block size in bytes; confirm in header */
+	4,	/* presumably log2 of the block size; confirm in header */
+	(void (*)(const br_block_cbcdec_class **, const void *, size_t))
+		&br_aes_pwr8_cbcdec_init,
+	(void (*)(const br_block_cbcdec_class *const *, void *, void *, size_t))
+		&br_aes_pwr8_cbcdec_run
+};
+
+/* see bearssl_block.h; NULL unless br_aes_pwr8_supported() is true */
+const br_block_cbcdec_class *
+br_aes_pwr8_cbcdec_get_vtable(void)
+{
+	return !br_aes_pwr8_supported() ? NULL : &br_aes_pwr8_cbcdec_vtable;
+}
+
+#else
+
+/* see bearssl_block.h (fallback when the code above is compiled out) */
+const br_block_cbcdec_class *
+br_aes_pwr8_cbcdec_get_vtable(void)
+{
+	return NULL;
+}
+
+#endif
diff --git a/src/symcipher/aes_pwr8_cbcenc.c b/src/symcipher/aes_pwr8_cbcenc.c
new file mode 100644
index 0000000..00f8eca
--- /dev/null
+++ b/src/symcipher/aes_pwr8_cbcenc.c
@@ -0,0 +1,417 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define BR_POWER_ASM_MACROS   1
+#include "inner.h"
+
+#if BR_POWER8
+
+/* see bearssl_block.h; the key schedule returns the number of rounds */
+void
+br_aes_pwr8_cbcenc_init(br_aes_pwr8_cbcenc_keys *ctx,
+	const void *key, size_t len)
+{
+	ctx->num_rounds = br_aes_pwr8_keysched(ctx->skey.skni, key, len);
+	ctx->vtable = &br_aes_pwr8_cbcenc_vtable;
+}
+
+static void
+cbcenc_128(const unsigned char *sk,
+	const unsigned char *iv, unsigned char *buf, size_t len)
+{
+	long cc;
+	/* AES-128 CBC encryption in place; len >> 4 blocks are processed. */
+#if BR_POWER8_LE
+	static const uint32_t idx2be[] = {
+		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
+	};
+#endif
+
+	cc = 0;
+	asm volatile (
+
+		/*
+		 * Load the 11 subkeys into v0..v10.
+		 */
+		lxvw4x(32, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(33, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(34, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(35, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(36, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(37, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(38, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(39, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(40, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(41, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(42, %[cc], %[sk])
+
+#if BR_POWER8_LE
+		/*
+		 * v15 = vperm index vector that byteswaps each 32-bit word
+		 */
+		lxvw4x(47, 0, %[idx2be])
+#endif
+		/*
+		 * Load IV into v16 (byteswapped on little-endian).
+		 */
+		lxvw4x(48, 0, %[iv])
+#if BR_POWER8_LE
+		vperm(16, 16, 16, 15)
+#endif
+
+		mtctr(%[num_blocks])
+	label(loop)
+		/*
+		 * Load next plaintext block (v17), XOR into current IV (v16).
+		 */
+		lxvw4x(49, 0, %[buf])
+#if BR_POWER8_LE
+		vperm(17, 17, 17, 15)
+#endif
+		vxor(16, 16, 17)
+
+		/*
+		 * Encrypt the block; v16 then also serves as the next IV.
+		 */
+		vxor(16, 16, 0)
+		vcipher(16, 16, 1)
+		vcipher(16, 16, 2)
+		vcipher(16, 16, 3)
+		vcipher(16, 16, 4)
+		vcipher(16, 16, 5)
+		vcipher(16, 16, 6)
+		vcipher(16, 16, 7)
+		vcipher(16, 16, 8)
+		vcipher(16, 16, 9)
+		vcipherlast(16, 16, 10)
+
+		/*
+		 * Store back result (via a byteswapped copy in v17 on LE).
+		 */
+#if BR_POWER8_LE
+		vperm(17, 16, 16, 15)
+		stxvw4x(49, 0, %[buf])
+#else
+		stxvw4x(48, 0, %[buf])
+#endif
+		addi(%[buf], %[buf], 16)
+
+		bdnz(loop)
+
+: [cc] "+b" (cc), [buf] "+b" (buf)
+: [sk] "b" (sk), [iv] "b" (iv), [num_blocks] "b" (len >> 4)
+#if BR_POWER8_LE
+	, [idx2be] "b" (idx2be)
+#endif
+: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
+  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+  "ctr", "memory"
+	);
+}
+
+static void
+cbcenc_192(const unsigned char *sk,
+	const unsigned char *iv, unsigned char *buf, size_t len)
+{
+	long cc;
+	/* AES-192 CBC encryption in place; len >> 4 blocks are processed. */
+#if BR_POWER8_LE
+	static const uint32_t idx2be[] = {
+		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
+	};
+#endif
+
+	cc = 0;
+	asm volatile (
+
+		/*
+		 * Load the 13 subkeys into v0..v12.
+		 */
+		lxvw4x(32, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(33, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(34, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(35, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(36, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(37, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(38, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(39, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(40, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(41, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(42, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(43, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(44, %[cc], %[sk])
+
+#if BR_POWER8_LE
+		/*
+		 * v15 = vperm index vector that byteswaps each 32-bit word
+		 */
+		lxvw4x(47, 0, %[idx2be])
+#endif
+		/*
+		 * Load IV into v16 (byteswapped on little-endian).
+		 */
+		lxvw4x(48, 0, %[iv])
+#if BR_POWER8_LE
+		vperm(16, 16, 16, 15)
+#endif
+
+		mtctr(%[num_blocks])
+	label(loop)
+		/*
+		 * Load next plaintext block (v17), XOR into current IV (v16).
+		 */
+		lxvw4x(49, 0, %[buf])
+#if BR_POWER8_LE
+		vperm(17, 17, 17, 15)
+#endif
+		vxor(16, 16, 17)
+
+		/*
+		 * Encrypt the block; v16 then also serves as the next IV.
+		 */
+		vxor(16, 16, 0)
+		vcipher(16, 16, 1)
+		vcipher(16, 16, 2)
+		vcipher(16, 16, 3)
+		vcipher(16, 16, 4)
+		vcipher(16, 16, 5)
+		vcipher(16, 16, 6)
+		vcipher(16, 16, 7)
+		vcipher(16, 16, 8)
+		vcipher(16, 16, 9)
+		vcipher(16, 16, 10)
+		vcipher(16, 16, 11)
+		vcipherlast(16, 16, 12)
+
+		/*
+		 * Store back result (via a byteswapped copy in v17 on LE).
+		 */
+#if BR_POWER8_LE
+		vperm(17, 16, 16, 15)
+		stxvw4x(49, 0, %[buf])
+#else
+		stxvw4x(48, 0, %[buf])
+#endif
+		addi(%[buf], %[buf], 16)
+
+		bdnz(loop)
+
+: [cc] "+b" (cc), [buf] "+b" (buf)
+: [sk] "b" (sk), [iv] "b" (iv), [num_blocks] "b" (len >> 4)
+#if BR_POWER8_LE
+	, [idx2be] "b" (idx2be)
+#endif
+: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
+  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+  "ctr", "memory"
+	);
+}
+
+static void
+cbcenc_256(const unsigned char *sk,
+	const unsigned char *iv, unsigned char *buf, size_t len)
+{
+	long cc;
+	/* AES-256 CBC encryption in place; len >> 4 blocks are processed. */
+#if BR_POWER8_LE
+	static const uint32_t idx2be[] = {
+		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
+	};
+#endif
+
+	cc = 0;
+	asm volatile (
+
+		/*
+		 * Load the 15 subkeys into v0..v14.
+		 */
+		lxvw4x(32, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(33, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(34, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(35, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(36, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(37, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(38, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(39, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(40, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(41, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(42, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(43, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(44, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(45, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(46, %[cc], %[sk])
+
+#if BR_POWER8_LE
+		/*
+		 * v15 = vperm index vector that byteswaps each 32-bit word
+		 */
+		lxvw4x(47, 0, %[idx2be])
+#endif
+		/*
+		 * Load IV into v16 (byteswapped on little-endian).
+		 */
+		lxvw4x(48, 0, %[iv])
+#if BR_POWER8_LE
+		vperm(16, 16, 16, 15)
+#endif
+
+		mtctr(%[num_blocks])
+	label(loop)
+		/*
+		 * Load next plaintext block (v17), XOR into current IV (v16).
+		 */
+		lxvw4x(49, 0, %[buf])
+#if BR_POWER8_LE
+		vperm(17, 17, 17, 15)
+#endif
+		vxor(16, 16, 17)
+
+		/*
+		 * Encrypt the block; v16 then also serves as the next IV.
+		 */
+		vxor(16, 16, 0)
+		vcipher(16, 16, 1)
+		vcipher(16, 16, 2)
+		vcipher(16, 16, 3)
+		vcipher(16, 16, 4)
+		vcipher(16, 16, 5)
+		vcipher(16, 16, 6)
+		vcipher(16, 16, 7)
+		vcipher(16, 16, 8)
+		vcipher(16, 16, 9)
+		vcipher(16, 16, 10)
+		vcipher(16, 16, 11)
+		vcipher(16, 16, 12)
+		vcipher(16, 16, 13)
+		vcipherlast(16, 16, 14)
+
+		/*
+		 * Store back result (via a byteswapped copy in v17 on LE).
+		 */
+#if BR_POWER8_LE
+		vperm(17, 16, 16, 15)
+		stxvw4x(49, 0, %[buf])
+#else
+		stxvw4x(48, 0, %[buf])
+#endif
+		addi(%[buf], %[buf], 16)
+
+		bdnz(loop)
+
+: [cc] "+b" (cc), [buf] "+b" (buf)
+: [sk] "b" (sk), [iv] "b" (iv), [num_blocks] "b" (len >> 4)
+#if BR_POWER8_LE
+	, [idx2be] "b" (idx2be)
+#endif
+: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
+  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+  "ctr", "memory"
+	);
+}
+
+/* see bearssl_block.h */
+void
+br_aes_pwr8_cbcenc_run(const br_aes_pwr8_cbcenc_keys *ctx,
+	void *iv, void *data, size_t len)
+{
+	unsigned char *buf = data;
+
+	if (len == 0) {
+		return;
+	}
+	if (ctx->num_rounds == 10) {
+		cbcenc_128(ctx->skey.skni, iv, buf, len);
+	} else if (ctx->num_rounds == 12) {
+		cbcenc_192(ctx->skey.skni, iv, buf, len);
+	} else {
+		cbcenc_256(ctx->skey.skni, iv, buf, len);
+	}
+	/* The last ciphertext block is the IV for the next call. */
+	memcpy(iv, buf + (len - 16), 16);
+}
+
+/* see bearssl_block.h (which declares the fields of this descriptor) */
+const br_block_cbcenc_class br_aes_pwr8_cbcenc_vtable = {
+	sizeof(br_aes_pwr8_cbcenc_keys),
+	16,	/* presumably the block size in bytes; confirm in header */
+	4,	/* presumably log2 of the block size; confirm in header */
+	(void (*)(const br_block_cbcenc_class **, const void *, size_t))
+		&br_aes_pwr8_cbcenc_init,
+	(void (*)(const br_block_cbcenc_class *const *, void *, void *, size_t))
+		&br_aes_pwr8_cbcenc_run
+};
+
+/* see bearssl_block.h; NULL unless br_aes_pwr8_supported() is true */
+const br_block_cbcenc_class *
+br_aes_pwr8_cbcenc_get_vtable(void)
+{
+	return !br_aes_pwr8_supported() ? NULL : &br_aes_pwr8_cbcenc_vtable;
+}
+
+#else
+
+/* see bearssl_block.h (fallback when BR_POWER8 is not enabled) */
+const br_block_cbcenc_class *
+br_aes_pwr8_cbcenc_get_vtable(void)
+{
+	return NULL;
+}
+
+#endif
diff --git a/src/symcipher/aes_pwr8_ctr.c b/src/symcipher/aes_pwr8_ctr.c
new file mode 100644
index 0000000..f5d20c0
--- /dev/null
+++ b/src/symcipher/aes_pwr8_ctr.c
@@ -0,0 +1,717 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define BR_POWER_ASM_MACROS   1
+#include "inner.h"
+
+#if BR_POWER8
+
+/* see bearssl_block.h; the key schedule returns the number of rounds */
+void
+br_aes_pwr8_ctr_init(br_aes_pwr8_ctr_keys *ctx,
+	const void *key, size_t len)
+{
+	ctx->num_rounds = br_aes_pwr8_keysched(ctx->skey.skni, key, len);
+	ctx->vtable = &br_aes_pwr8_ctr_vtable;
+}
+
+static void
+ctr_128(const unsigned char *sk, const unsigned char *ivbuf,
+	unsigned char *buf, size_t num_blocks)
+{
+	long cc0, cc1, cc2, cc3;
+	/* AES-128 CTR over buf, four blocks (64 bytes) per loop iteration. */
+#if BR_POWER8_LE
+	static const uint32_t idx2be[] = {
+		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
+	};
+#endif
+	static const uint32_t ctrinc[] = {
+		0, 0, 0, 4
+	};
+
+	cc0 = 0;
+	cc1 = 16;
+	cc2 = 32;
+	cc3 = 48;
+	asm volatile (
+
+		/*
+		 * Load the 11 subkeys into v0..v10, then reset cc0 to 0.
+		 */
+		lxvw4x(32, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(33, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(34, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(35, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(36, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(37, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(38, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(39, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(40, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(41, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(42, %[cc0], %[sk])
+		li(%[cc0], 0)
+
+#if BR_POWER8_LE
+		/*
+		 * v15 = vperm index vector that byteswaps each 32-bit word
+		 */
+		lxvw4x(47, 0, %[idx2be])
+#endif
+		/*
+		 * v28 = increment (+4 on the last word) for the IV counters.
+		 */
+		lxvw4x(60, 0, %[ctrinc])
+
+		/*
+		 * Load the four counter blocks from ivbuf into v16..v19.
+		 */
+		lxvw4x(48, %[cc0], %[ivbuf])
+		lxvw4x(49, %[cc1], %[ivbuf])
+		lxvw4x(50, %[cc2], %[ivbuf])
+		lxvw4x(51, %[cc3], %[ivbuf])
+#if BR_POWER8_LE
+		vperm(16, 16, 16, 15)
+		vperm(17, 17, 17, 15)
+		vperm(18, 18, 18, 15)
+		vperm(19, 19, 19, 15)
+#endif
+
+		mtctr(%[num_blocks])
+	label(loop)
+		/*
+		 * Compute the next four counter blocks into v24..v27.
+		 */
+		vadduwm(24, 16, 28)
+		vadduwm(25, 17, 28)
+		vadduwm(26, 18, 28)
+		vadduwm(27, 19, 28)
+
+		/*
+		 * Load the next four data blocks into v20..v23. This is done
+		 * early; they are not needed until IV encryption is done.
+		 */
+		lxvw4x(52, %[cc0], %[buf])
+		lxvw4x(53, %[cc1], %[buf])
+		lxvw4x(54, %[cc2], %[buf])
+		lxvw4x(55, %[cc3], %[buf])
+
+		/*
+		 * Encrypt the four current counter blocks (v16..v19).
+		 */
+		vxor(16, 16, 0)
+		vxor(17, 17, 0)
+		vxor(18, 18, 0)
+		vxor(19, 19, 0)
+		vcipher(16, 16, 1)
+		vcipher(17, 17, 1)
+		vcipher(18, 18, 1)
+		vcipher(19, 19, 1)
+		vcipher(16, 16, 2)
+		vcipher(17, 17, 2)
+		vcipher(18, 18, 2)
+		vcipher(19, 19, 2)
+		vcipher(16, 16, 3)
+		vcipher(17, 17, 3)
+		vcipher(18, 18, 3)
+		vcipher(19, 19, 3)
+		vcipher(16, 16, 4)
+		vcipher(17, 17, 4)
+		vcipher(18, 18, 4)
+		vcipher(19, 19, 4)
+		vcipher(16, 16, 5)
+		vcipher(17, 17, 5)
+		vcipher(18, 18, 5)
+		vcipher(19, 19, 5)
+		vcipher(16, 16, 6)
+		vcipher(17, 17, 6)
+		vcipher(18, 18, 6)
+		vcipher(19, 19, 6)
+		vcipher(16, 16, 7)
+		vcipher(17, 17, 7)
+		vcipher(18, 18, 7)
+		vcipher(19, 19, 7)
+		vcipher(16, 16, 8)
+		vcipher(17, 17, 8)
+		vcipher(18, 18, 8)
+		vcipher(19, 19, 8)
+		vcipher(16, 16, 9)
+		vcipher(17, 17, 9)
+		vcipher(18, 18, 9)
+		vcipher(19, 19, 9)
+		vcipherlast(16, 16, 10)
+		vcipherlast(17, 17, 10)
+		vcipherlast(18, 18, 10)
+		vcipherlast(19, 19, 10)
+
+#if BR_POWER8_LE
+		vperm(16, 16, 16, 15)
+		vperm(17, 17, 17, 15)
+		vperm(18, 18, 18, 15)
+		vperm(19, 19, 19, 15)
+#endif
+
+		/*
+		 * XOR the data with the encrypted IVs and store the result.
+		 */
+		vxor(16, 20, 16)
+		vxor(17, 21, 17)
+		vxor(18, 22, 18)
+		vxor(19, 23, 19)
+		stxvw4x(48, %[cc0], %[buf])
+		stxvw4x(49, %[cc1], %[buf])
+		stxvw4x(50, %[cc2], %[buf])
+		stxvw4x(51, %[cc3], %[buf])
+
+		addi(%[buf], %[buf], 64)
+
+		/*
+		 * Update IV: v16..v19 take the values computed in v24..v27.
+		 */
+		vand(16, 24, 24)
+		vand(17, 25, 25)
+		vand(18, 26, 26)
+		vand(19, 27, 27)
+
+		bdnz(loop)
+
+: [cc0] "+b" (cc0), [cc1] "+b" (cc1), [cc2] "+b" (cc2), [cc3] "+b" (cc3),
+  [buf] "+b" (buf)
+: [sk] "b" (sk), [ivbuf] "b" (ivbuf), [num_blocks] "b" (num_blocks >> 2),
+  [ctrinc] "b" (ctrinc)
+#if BR_POWER8_LE
+	, [idx2be] "b" (idx2be)
+#endif
+: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
+  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
+  "ctr", "memory"
+	);
+}
+
+static void
+ctr_192(const unsigned char *sk, const unsigned char *ivbuf,
+	unsigned char *buf, size_t num_blocks)
+{
+	long cc0, cc1, cc2, cc3;
+	/* AES-192 CTR over buf, four blocks (64 bytes) per loop iteration. */
+#if BR_POWER8_LE
+	static const uint32_t idx2be[] = {
+		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
+	};
+#endif
+	static const uint32_t ctrinc[] = {
+		0, 0, 0, 4
+	};
+
+	cc0 = 0;
+	cc1 = 16;
+	cc2 = 32;
+	cc3 = 48;
+	asm volatile (
+
+		/*
+		 * Load the 13 subkeys into v0..v12, then reset cc0 to 0.
+		 */
+		lxvw4x(32, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(33, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(34, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(35, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(36, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(37, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(38, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(39, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(40, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(41, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(42, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(43, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(44, %[cc0], %[sk])
+		li(%[cc0], 0)
+
+#if BR_POWER8_LE
+		/*
+		 * v15 = vperm index vector that byteswaps each 32-bit word
+		 */
+		lxvw4x(47, 0, %[idx2be])
+#endif
+		/*
+		 * v28 = increment (+4 on the last word) for the IV counters.
+		 */
+		lxvw4x(60, 0, %[ctrinc])
+
+		/*
+		 * Load the four counter blocks from ivbuf into v16..v19.
+		 */
+		lxvw4x(48, %[cc0], %[ivbuf])
+		lxvw4x(49, %[cc1], %[ivbuf])
+		lxvw4x(50, %[cc2], %[ivbuf])
+		lxvw4x(51, %[cc3], %[ivbuf])
+#if BR_POWER8_LE
+		vperm(16, 16, 16, 15)
+		vperm(17, 17, 17, 15)
+		vperm(18, 18, 18, 15)
+		vperm(19, 19, 19, 15)
+#endif
+
+		mtctr(%[num_blocks])
+	label(loop)
+		/*
+		 * Compute the next four counter blocks into v24..v27.
+		 */
+		vadduwm(24, 16, 28)
+		vadduwm(25, 17, 28)
+		vadduwm(26, 18, 28)
+		vadduwm(27, 19, 28)
+
+		/*
+		 * Load the next four data blocks into v20..v23. This is done
+		 * early; they are not needed until IV encryption is done.
+		 */
+		lxvw4x(52, %[cc0], %[buf])
+		lxvw4x(53, %[cc1], %[buf])
+		lxvw4x(54, %[cc2], %[buf])
+		lxvw4x(55, %[cc3], %[buf])
+
+		/*
+		 * Encrypt the four current counter blocks (v16..v19).
+		 */
+		vxor(16, 16, 0)
+		vxor(17, 17, 0)
+		vxor(18, 18, 0)
+		vxor(19, 19, 0)
+		vcipher(16, 16, 1)
+		vcipher(17, 17, 1)
+		vcipher(18, 18, 1)
+		vcipher(19, 19, 1)
+		vcipher(16, 16, 2)
+		vcipher(17, 17, 2)
+		vcipher(18, 18, 2)
+		vcipher(19, 19, 2)
+		vcipher(16, 16, 3)
+		vcipher(17, 17, 3)
+		vcipher(18, 18, 3)
+		vcipher(19, 19, 3)
+		vcipher(16, 16, 4)
+		vcipher(17, 17, 4)
+		vcipher(18, 18, 4)
+		vcipher(19, 19, 4)
+		vcipher(16, 16, 5)
+		vcipher(17, 17, 5)
+		vcipher(18, 18, 5)
+		vcipher(19, 19, 5)
+		vcipher(16, 16, 6)
+		vcipher(17, 17, 6)
+		vcipher(18, 18, 6)
+		vcipher(19, 19, 6)
+		vcipher(16, 16, 7)
+		vcipher(17, 17, 7)
+		vcipher(18, 18, 7)
+		vcipher(19, 19, 7)
+		vcipher(16, 16, 8)
+		vcipher(17, 17, 8)
+		vcipher(18, 18, 8)
+		vcipher(19, 19, 8)
+		vcipher(16, 16, 9)
+		vcipher(17, 17, 9)
+		vcipher(18, 18, 9)
+		vcipher(19, 19, 9)
+		vcipher(16, 16, 10)
+		vcipher(17, 17, 10)
+		vcipher(18, 18, 10)
+		vcipher(19, 19, 10)
+		vcipher(16, 16, 11)
+		vcipher(17, 17, 11)
+		vcipher(18, 18, 11)
+		vcipher(19, 19, 11)
+		vcipherlast(16, 16, 12)
+		vcipherlast(17, 17, 12)
+		vcipherlast(18, 18, 12)
+		vcipherlast(19, 19, 12)
+
+#if BR_POWER8_LE
+		vperm(16, 16, 16, 15)
+		vperm(17, 17, 17, 15)
+		vperm(18, 18, 18, 15)
+		vperm(19, 19, 19, 15)
+#endif
+
+		/*
+		 * XOR the data with the encrypted IVs and store the result.
+		 */
+		vxor(16, 20, 16)
+		vxor(17, 21, 17)
+		vxor(18, 22, 18)
+		vxor(19, 23, 19)
+		stxvw4x(48, %[cc0], %[buf])
+		stxvw4x(49, %[cc1], %[buf])
+		stxvw4x(50, %[cc2], %[buf])
+		stxvw4x(51, %[cc3], %[buf])
+
+		addi(%[buf], %[buf], 64)
+
+		/*
+		 * Update IV: v16..v19 take the values computed in v24..v27.
+		 */
+		vand(16, 24, 24)
+		vand(17, 25, 25)
+		vand(18, 26, 26)
+		vand(19, 27, 27)
+
+		bdnz(loop)
+
+: [cc0] "+b" (cc0), [cc1] "+b" (cc1), [cc2] "+b" (cc2), [cc3] "+b" (cc3),
+  [buf] "+b" (buf)
+: [sk] "b" (sk), [ivbuf] "b" (ivbuf), [num_blocks] "b" (num_blocks >> 2),
+  [ctrinc] "b" (ctrinc)
+#if BR_POWER8_LE
+	, [idx2be] "b" (idx2be)
+#endif
+: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
+  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
+  "ctr", "memory"
+	);
+}
+
+static void
+ctr_256(const unsigned char *sk, const unsigned char *ivbuf,
+	unsigned char *buf, size_t num_blocks)
+{
+	long cc0, cc1, cc2, cc3;
+	/* AES-256 CTR over buf, four blocks (64 bytes) per loop iteration. */
+#if BR_POWER8_LE
+	static const uint32_t idx2be[] = {
+		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
+	};
+#endif
+	static const uint32_t ctrinc[] = {
+		0, 0, 0, 4
+	};
+
+	cc0 = 0;
+	cc1 = 16;
+	cc2 = 32;
+	cc3 = 48;
+	asm volatile (
+
+		/*
+		 * Load the 15 subkeys into v0..v14, then reset cc0 to 0.
+		 */
+		lxvw4x(32, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(33, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(34, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(35, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(36, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(37, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(38, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(39, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(40, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(41, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(42, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(43, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(44, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(45, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(46, %[cc0], %[sk])
+		li(%[cc0], 0)
+
+#if BR_POWER8_LE
+		/*
+		 * v15 = vperm index vector that byteswaps each 32-bit word
+		 */
+		lxvw4x(47, 0, %[idx2be])
+#endif
+		/*
+		 * v28 = increment (+4 on the last word) for the IV counters.
+		 */
+		lxvw4x(60, 0, %[ctrinc])
+
+		/*
+		 * Load the four counter blocks from ivbuf into v16..v19.
+		 */
+		lxvw4x(48, %[cc0], %[ivbuf])
+		lxvw4x(49, %[cc1], %[ivbuf])
+		lxvw4x(50, %[cc2], %[ivbuf])
+		lxvw4x(51, %[cc3], %[ivbuf])
+#if BR_POWER8_LE
+		vperm(16, 16, 16, 15)
+		vperm(17, 17, 17, 15)
+		vperm(18, 18, 18, 15)
+		vperm(19, 19, 19, 15)
+#endif
+
+		mtctr(%[num_blocks])
+	label(loop)
+		/*
+		 * Compute the next four counter blocks into v24..v27.
+		 */
+		vadduwm(24, 16, 28)
+		vadduwm(25, 17, 28)
+		vadduwm(26, 18, 28)
+		vadduwm(27, 19, 28)
+
+		/*
+		 * Load the next four data blocks into v20..v23. This is done
+		 * early; they are not needed until IV encryption is done.
+		 */
+		lxvw4x(52, %[cc0], %[buf])
+		lxvw4x(53, %[cc1], %[buf])
+		lxvw4x(54, %[cc2], %[buf])
+		lxvw4x(55, %[cc3], %[buf])
+
+		/*
+		 * Encrypt the four current counter blocks (v16..v19).
+		 */
+		vxor(16, 16, 0)
+		vxor(17, 17, 0)
+		vxor(18, 18, 0)
+		vxor(19, 19, 0)
+		vcipher(16, 16, 1)
+		vcipher(17, 17, 1)
+		vcipher(18, 18, 1)
+		vcipher(19, 19, 1)
+		vcipher(16, 16, 2)
+		vcipher(17, 17, 2)
+		vcipher(18, 18, 2)
+		vcipher(19, 19, 2)
+		vcipher(16, 16, 3)
+		vcipher(17, 17, 3)
+		vcipher(18, 18, 3)
+		vcipher(19, 19, 3)
+		vcipher(16, 16, 4)
+		vcipher(17, 17, 4)
+		vcipher(18, 18, 4)
+		vcipher(19, 19, 4)
+		vcipher(16, 16, 5)
+		vcipher(17, 17, 5)
+		vcipher(18, 18, 5)
+		vcipher(19, 19, 5)
+		vcipher(16, 16, 6)
+		vcipher(17, 17, 6)
+		vcipher(18, 18, 6)
+		vcipher(19, 19, 6)
+		vcipher(16, 16, 7)
+		vcipher(17, 17, 7)
+		vcipher(18, 18, 7)
+		vcipher(19, 19, 7)
+		vcipher(16, 16, 8)
+		vcipher(17, 17, 8)
+		vcipher(18, 18, 8)
+		vcipher(19, 19, 8)
+		vcipher(16, 16, 9)
+		vcipher(17, 17, 9)
+		vcipher(18, 18, 9)
+		vcipher(19, 19, 9)
+		vcipher(16, 16, 10)
+		vcipher(17, 17, 10)
+		vcipher(18, 18, 10)
+		vcipher(19, 19, 10)
+		vcipher(16, 16, 11)
+		vcipher(17, 17, 11)
+		vcipher(18, 18, 11)
+		vcipher(19, 19, 11)
+		vcipher(16, 16, 12)
+		vcipher(17, 17, 12)
+		vcipher(18, 18, 12)
+		vcipher(19, 19, 12)
+		vcipher(16, 16, 13)
+		vcipher(17, 17, 13)
+		vcipher(18, 18, 13)
+		vcipher(19, 19, 13)
+		vcipherlast(16, 16, 14)
+		vcipherlast(17, 17, 14)
+		vcipherlast(18, 18, 14)
+		vcipherlast(19, 19, 14)
+
+#if BR_POWER8_LE
+		vperm(16, 16, 16, 15)
+		vperm(17, 17, 17, 15)
+		vperm(18, 18, 18, 15)
+		vperm(19, 19, 19, 15)
+#endif
+
+		/*
+		 * XOR the data with the encrypted IVs and store the result.
+		 */
+		vxor(16, 20, 16)
+		vxor(17, 21, 17)
+		vxor(18, 22, 18)
+		vxor(19, 23, 19)
+		stxvw4x(48, %[cc0], %[buf])
+		stxvw4x(49, %[cc1], %[buf])
+		stxvw4x(50, %[cc2], %[buf])
+		stxvw4x(51, %[cc3], %[buf])
+
+		addi(%[buf], %[buf], 64)
+
+		/*
+		 * Update IV: v16..v19 take the values computed in v24..v27.
+		 */
+		vand(16, 24, 24)
+		vand(17, 25, 25)
+		vand(18, 26, 26)
+		vand(19, 27, 27)
+
+		bdnz(loop)
+
+: [cc0] "+b" (cc0), [cc1] "+b" (cc1), [cc2] "+b" (cc2), [cc3] "+b" (cc3),
+  [buf] "+b" (buf)
+: [sk] "b" (sk), [ivbuf] "b" (ivbuf), [num_blocks] "b" (num_blocks >> 2),
+  [ctrinc] "b" (ctrinc)
+#if BR_POWER8_LE
+	, [idx2be] "b" (idx2be)
+#endif
+: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
+  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
+  "ctr", "memory"
+	);
+}
+
+/* see bearssl_block.h */
+uint32_t
+br_aes_pwr8_ctr_run(const br_aes_pwr8_ctr_keys *ctx,
+	const void *iv, uint32_t cc, void *data, size_t len)
+{
+	unsigned char ctrblk[64];
+	unsigned char *p;
+	size_t nblk;
+	int i;
+
+	/*
+	 * Four consecutive counter blocks are handed to the assembly
+	 * routines; they all start with the same 12-byte IV.
+	 */
+	for (i = 0; i < 64; i += 16) {
+		memcpy(ctrblk + i, iv, 12);
+	}
+	p = data;
+
+	/* Bulk part: whole 64-byte chunks, processed in place. */
+	nblk = (len >> 4) & ~(size_t)3;
+	if (nblk != 0) {
+		for (i = 0; i < 4; i ++) {
+			br_enc32be(ctrblk + (i << 4) + 12, cc + (uint32_t)i);
+		}
+		if (ctx->num_rounds == 10) {
+			ctr_128(ctx->skey.skni, ctrblk, p, nblk);
+		} else if (ctx->num_rounds == 12) {
+			ctr_192(ctx->skey.skni, ctrblk, p, nblk);
+		} else {
+			ctr_256(ctx->skey.skni, ctrblk, p, nblk);
+		}
+		cc += nblk;
+		p += nblk << 4;
+		len &= 63;
+	}
+
+	/*
+	 * Tail (fewer than 64 bytes): processed in a zero-padded
+	 * temporary chunk; only the first len bytes are copied back.
+	 */
+	if (len != 0) {
+		unsigned char pad[64];
+
+		memcpy(pad, p, len);
+		memset(pad + len, 0, (sizeof pad) - len);
+		for (i = 0; i < 4; i ++) {
+			br_enc32be(ctrblk + (i << 4) + 12, cc + (uint32_t)i);
+		}
+		if (ctx->num_rounds == 10) {
+			ctr_128(ctx->skey.skni, ctrblk, pad, 4);
+		} else if (ctx->num_rounds == 12) {
+			ctr_192(ctx->skey.skni, ctrblk, pad, 4);
+		} else {
+			ctr_256(ctx->skey.skni, ctrblk, pad, 4);
+		}
+		memcpy(p, pad, len);
+		cc += (len + 15) >> 4;
+	}
+	return cc;
+}
+
+/* see bearssl_block.h (which declares the fields of this descriptor) */
+const br_block_ctr_class br_aes_pwr8_ctr_vtable = {
+	sizeof(br_aes_pwr8_ctr_keys),
+	16,	/* presumably the block size in bytes; confirm in header */
+	4,	/* presumably log2 of the block size; confirm in header */
+	(void (*)(const br_block_ctr_class **, const void *, size_t))
+		&br_aes_pwr8_ctr_init,
+	(uint32_t (*)(const br_block_ctr_class *const *,
+		const void *, uint32_t, void *, size_t))
+		&br_aes_pwr8_ctr_run
+};
+
+/* see bearssl_block.h; NULL unless br_aes_pwr8_supported() is true */
+const br_block_ctr_class *
+br_aes_pwr8_ctr_get_vtable(void)
+{
+	return !br_aes_pwr8_supported() ? NULL : &br_aes_pwr8_ctr_vtable;
+}
+
+#else
+
+/* see bearssl_block.h (fallback when BR_POWER8 is not enabled) */
+const br_block_ctr_class *
+br_aes_pwr8_ctr_get_vtable(void)
+{
+	return NULL;
+}
+
+#endif
diff --git a/test/test_crypto.c b/test/test_crypto.c
index ca7234d..c05fca5 100644
--- a/test/test_crypto.c
+++ b/test/test_crypto.c
@@ -3297,6 +3297,33 @@ test_AES_x86ni(void)
 	}
 }
 
+static void
+test_AES_pwr8(void)
+{
+	/*
+	 * The three POWER8 AES vtables must be all present or all absent;
+	 * a partial set aborts the run. When present, they go through the
+	 * generic AES tests; otherwise the test is reported as unavailable.
+	 */
+	const br_block_cbcenc_class *enc = br_aes_pwr8_cbcenc_get_vtable();
+	const br_block_cbcdec_class *dec = br_aes_pwr8_cbcdec_get_vtable();
+	const br_block_ctr_class *ctr = br_aes_pwr8_ctr_get_vtable();
+	int has_enc = (enc != NULL);
+	int has_dec = (dec != NULL);
+	int has_ctr = (ctr != NULL);
+
+	if (has_enc != has_ctr || has_dec != has_ctr) {
+		fprintf(stderr, "AES_pwr8 availability mismatch (%d/%d/%d)\n",
+			has_enc, has_dec, has_ctr);
+		exit(EXIT_FAILURE);
+	}
+	if (!has_ctr) {
+		printf("Test AES_pwr8: UNAVAILABLE\n");
+		return;
+	}
+	test_AES_generic("AES_pwr8", enc, dec, ctr, 1, 1);
+}
+
 /*
  * DES known-answer tests. Order: plaintext, key, ciphertext.
  * (mostly from NIST SP 800-20).
@@ -4675,6 +4702,31 @@ test_GHASH(const char *name, br_ghash gh)
 		check_equals("KAT GHASH", y, ref, sizeof ref);
 	}
 
+	for (u = 0; u <= 1024; u ++) {
+		unsigned char key[32], iv[12];
+		unsigned char buf[1024 + 32];
+		unsigned char y0[16], y1[16];
+		char tmp[100];
+		/* Fill buf by running ChaCha20 over zeros, keyed with u. */
+		memset(key, 0, sizeof key);
+		memset(iv, 0, sizeof iv);
+		br_enc32be(key, u);
+		memset(buf, 0, sizeof buf);
+		br_chacha20_ct_run(key, iv, 1, buf, sizeof buf);
+		/* Cross-check gh against br_ghash_ctmul32 on equal inputs. */
+		memcpy(y0, buf, 16);
+		br_ghash_ctmul32(y0, buf + 16, buf + 32, u);
+		memcpy(y1, buf, 16);
+		gh(y1, buf + 16, buf + 32, u);
+		sprintf(tmp, "XREF %s (len = %u)", name, (unsigned)u);
+		check_equals(tmp, y0, y1, 16);
+		/* Progress dot once every 32 lengths. */
+		if ((u & 31) == 0) {
+			printf(".");
+			fflush(stdout);
+		}
+	}
+
 	printf("done.\n");
 	fflush(stdout);
 }
@@ -4710,6 +4762,19 @@ test_GHASH_pclmul(void)
 	}
 }
 
+static void
+test_GHASH_pwr8(void)
+{
+	br_ghash impl = br_ghash_pwr8_get();
+
+	/* A null function pointer is reported as UNAVAILABLE. */
+	if (impl != 0) {
+		test_GHASH("GHASH_pwr8", impl);
+		return;
+	}
+	printf("Test GHASH_pwr8: UNAVAILABLE\n");
+}
+
 static void
 test_EC_inner(const char *sk, const char *sU,
 	const br_ec_impl *impl, int curve)
@@ -5598,6 +5663,7 @@ static const struct {
 	STU(AES_small),
 	STU(AES_ct),
 	STU(AES_ct64),
+	STU(AES_pwr8),
 	STU(AES_x86ni),
 	STU(DES_tab),
 	STU(DES_ct),
@@ -5612,6 +5678,7 @@ static const struct {
 	STU(GHASH_ctmul32),
 	STU(GHASH_ctmul64),
 	STU(GHASH_pclmul),
+	STU(GHASH_pwr8),
 	STU(EC_prime_i15),
 	STU(EC_prime_i31),
 	STU(EC_p256_m15),
diff --git a/test/test_speed.c b/test/test_speed.c
index d7dfaad..6981299 100644
--- a/test/test_speed.c
+++ b/test/test_speed.c
@@ -249,6 +249,7 @@ SPEED_AES(small)
 SPEED_AES(ct)
 SPEED_AES(ct64)
 SPEED_AES(x86ni)
+SPEED_AES(pwr8)
 
 #define br_des_tab_cbcenc_get_vtable()     (&br_des_tab_cbcenc_vtable)
 #define br_des_tab_cbcdec_get_vtable()     (&br_des_tab_cbcdec_vtable)
@@ -334,6 +335,20 @@ test_speed_ghash_pclmul(void)
 	}
 }
 
+static void
+test_speed_ghash_pwr8(void)
+{
+	br_ghash impl = br_ghash_pwr8_get();
+
+	/* Benchmark only when the getter returns a usable function. */
+	if (impl != 0) {
+		test_speed_ghash_inner("GHASH (pwr8)", impl);
+		return;
+	}
+	printf("%-30s UNAVAILABLE\n", "GHASH (pwr8)");
+	fflush(stdout);
+}
+
 static uint32_t
 fake_chacha20(const void *key, const void *iv,
 	uint32_t cc, void *data, size_t len)
@@ -1215,6 +1230,16 @@ static const struct {
 	STU(aes192_x86ni_ctr),
 	STU(aes256_x86ni_ctr),
 
+	STU(aes128_pwr8_cbcenc),
+	STU(aes128_pwr8_cbcdec),
+	STU(aes192_pwr8_cbcenc),
+	STU(aes192_pwr8_cbcdec),
+	STU(aes256_pwr8_cbcenc),
+	STU(aes256_pwr8_cbcdec),
+	STU(aes128_pwr8_ctr),
+	STU(aes192_pwr8_ctr),
+	STU(aes256_pwr8_ctr),
+
 	STU(des_tab_cbcenc),
 	STU(des_tab_cbcdec),
 	STU(3des_tab_cbcenc),
@@ -1231,6 +1256,7 @@ static const struct {
 	STU(ghash_ctmul32),
 	STU(ghash_ctmul64),
 	STU(ghash_pclmul),
+	STU(ghash_pwr8),
 
 	STU(poly1305_ctmul),
 	STU(poly1305_ctmul32),
diff --git a/tools/names.c b/tools/names.c
index 753a736..a8bb645 100644
--- a/tools/names.c
+++ b/tools/names.c
@@ -406,6 +406,12 @@ static const struct {
 	const char *short_name;
 	const void *(*get)(void);
 } algo_names_dyn[] = {
+	{ "aes_pwr8_cbcenc",   "pwr8",
+		(const void *(*)(void))&br_aes_pwr8_cbcenc_get_vtable },
+	{ "aes_pwr8_cbcdec",   "pwr8",
+		(const void *(*)(void))&br_aes_pwr8_cbcdec_get_vtable },
+	{ "aes_pwr8_ctr",      "pwr8",
+		(const void *(*)(void))&br_aes_pwr8_ctr_get_vtable },
 	{ "aes_x86ni_cbcenc",  "x86ni",
 		(const void *(*)(void))&br_aes_x86ni_cbcenc_get_vtable },
 	{ "aes_x86ni_cbcdec",  "x86ni",
@@ -414,6 +420,8 @@ static const struct {
 		(const void *(*)(void))&br_aes_x86ni_ctr_get_vtable },
 	{ "ghash_pclmul",      "pclmul",
 		(const void *(*)(void))&br_ghash_pclmul_get },
+	{ "ghash_pwr8",        "pwr8",
+		(const void *(*)(void))&br_ghash_pwr8_get },
 	{ 0, 0, 0, }
 };