From 491a45337de8dc0a4c100abf33f5c0e187a08afd Mon Sep 17 00:00:00 2001
From: Thomas Pornin <pornin@bolet.org>
Date: Tue, 31 Jul 2018 23:00:26 +0200
Subject: [PATCH 1/1] Added RSA key generation code (i15, i31, i62).

---
 inc/bearssl_rsa.h              | 158 ++++++++-
 mk/Rules.mk                    |  28 ++
 mk/mkrules.sh                  |   7 +
 src/inner.h                    |  67 +++-
 src/int/i15_moddiv.c           | 465 +++++++++++++++++++++++++
 src/int/i31_moddiv.c           | 488 ++++++++++++++++++++++++++
 src/int/i62_modpow2.c          |  16 +-
 src/rsa/rsa_default_keygen.c   |  38 +++
 src/rsa/rsa_i15_keygen.c       | 583 +++++++++++++++++++++++++++++++
 src/rsa/rsa_i31_keygen.c       |  37 ++
 src/rsa/rsa_i31_keygen_inner.c | 608 +++++++++++++++++++++++++++++++++
 src/rsa/rsa_i62_keygen.c       |  57 ++++
 test/test_crypto.c             | 127 ++++++-
 test/test_speed.c              |  68 +++-
 tools/skey.c                   | 131 ++++++-
 15 files changed, 2865 insertions(+), 13 deletions(-)
 create mode 100644 src/int/i15_moddiv.c
 create mode 100644 src/int/i31_moddiv.c
 create mode 100644 src/rsa/rsa_default_keygen.c
 create mode 100644 src/rsa/rsa_i15_keygen.c
 create mode 100644 src/rsa/rsa_i31_keygen.c
 create mode 100644 src/rsa/rsa_i31_keygen_inner.c
 create mode 100644 src/rsa/rsa_i62_keygen.c

diff --git a/inc/bearssl_rsa.h b/inc/bearssl_rsa.h
index ac0415f..7c79d33 100644
--- a/inc/bearssl_rsa.h
+++ b/inc/bearssl_rsa.h
@@ -84,7 +84,7 @@ extern "C" {
  *     random masking, but "true" constant-time code.
  *
  *   - They support only private keys with two prime factors. RSA private
- *     key with three or more prime factors are nominally supported, but
+ *     keys with three or more prime factors are nominally supported, but
  *     rarely used; they may offer faster operations, at the expense of
  *     more code and potentially a reduction in security if there are
  *     "too many" prime factors.
@@ -92,7 +92,7 @@ extern "C" {
  *   - The public exponent may have arbitrary length. Of course, it is
  *     a good idea to keep public exponents small, so that public key
  *     operations are fast; but, contrary to some widely deployed
- *     implementations, BearSSL has no problem with public exponent
+ *     implementations, BearSSL has no problem with public exponents
  *     longer than 32 bits.
  *
  *   - The two prime factors of the modulus need not have the same length
@@ -170,7 +170,7 @@ typedef struct {
 /**
  * \brief RSA private key.
  *
- * The structure references the primvate factors, reduced private
+ * The structure references the private factors, reduced private
  * exponents, and CRT coefficient. It also contains the bit length of
  * the modulus. The big integers use unsigned big-endian representation;
  * extra leading bytes of value 0 are allowed. However, the modulus bit
@@ -1032,6 +1032,158 @@ uint32_t br_rsa_i62_oaep_decrypt(
 	const br_hash_class *dig, const void *label, size_t label_len,
 	const br_rsa_private_key *sk, void *data, size_t *len);
 
+/**
+ * \brief Get buffer size to hold RSA private key elements.
+ *
+ * This macro returns the length (in bytes) of the buffer needed to
+ * receive the elements of a RSA private key, as generated by one of
+ * the `br_rsa_*_keygen()` functions. If the provided size is a constant
+ * expression, then the whole macro evaluates to a constant expression.
+ *
+ * \param size   target key size (modulus size, in bits)
+ * \return  the length of the private key buffer, in bytes.
+ */
+#define BR_RSA_KBUF_PRIV_SIZE(size)    (5 * (((size) + 15) >> 4))
+
+/**
+ * \brief Get buffer size to hold RSA public key elements.
+ *
+ * This macro returns the length (in bytes) of the buffer needed to
+ * receive the elements of a RSA public key, as generated by one of
+ * the `br_rsa_*_keygen()` functions. If the provided size is a constant
+ * expression, then the whole macro evaluates to a constant expression.
+ *
+ * \param size   target key size (modulus size, in bits)
+ * \return  the length of the public key buffer, in bytes.
+ */
+#define BR_RSA_KBUF_PUB_SIZE(size)     (4 + (((size) + 7) >> 3))
+
+/**
+ * \brief Type for RSA key pair generator implementation.
+ *
+ * This function generates a new RSA key pair whose modulus has bit
+ * length `size` bits. The private key elements are written in the
+ * `kbuf_priv` buffer, and pointer values and length fields to these
+ * elements are populated in the provided private key structure `sk`.
+ * Similarly, the public key elements are written in `kbuf_pub`, with
+ * pointers and lengths set in `pk`.
+ *
+ * If `sk` is `NULL`, then `kbuf_pub` may be `NULL`, and only the
+ * private key is set.
+ *
+ * If `pubexp` is not zero, then its value will be used as public
+ * exponent. Valid RSA public exponent values are odd integers
+ * greater than 1. If `pubexp` is zero, then the public exponent will
+ * have value 3.
+ *
+ * The provided PRNG (`rng_ctx`) must have already been initialized
+ * and seeded.
+ *
+ * Returned value is 1 on success, 0 on error. An error is reported
+ * if the requested range is outside of the supported key sizes, or
+ * if an invalid non-zero public exponent value is provided. Supported
+ * range starts at 512 bits, and up to an implementation-defined
+ * maximum (by default 4096 bits). Note that key sizes up to 768 bits
+ * have been broken in practice, and sizes lower than 2048 bits are
+ * usually considered to be weak and should not be used.
+ *
+ * \param rng_ctx     source PRNG context (already initialized)
+ * \param sk          RSA private key structure (destination)
+ * \param kbuf_priv   buffer for private key elements
+ * \param pk          RSA public key structure (destination), or `NULL`
+ * \param kbuf_pub    buffer for public key elements, or `NULL`
+ * \param size        target RSA modulus size (in bits)
+ * \param pubexp      public exponent to use, or zero
+ * \return  1 on success, 0 on error (invalid parameters)
+ */
+typedef uint32_t (*br_rsa_keygen)(
+	const br_prng_class **rng_ctx,
+	br_rsa_private_key *sk, unsigned char *kbuf_priv,
+	br_rsa_public_key *pk, unsigned char *kbuf_pub,
+	unsigned size, uint32_t pubexp);
+
+/**
+ * \brief RSA key pair generation with the "i15" engine.
+ *
+ * \see br_rsa_keygen
+ *
+ * \param rng_ctx     source PRNG context (already initialized)
+ * \param sk          RSA private key structure (destination)
+ * \param kbuf_priv   buffer for private key elements
+ * \param pk          RSA public key structure (destination), or `NULL`
+ * \param kbuf_pub    buffer for public key elements, or `NULL`
+ * \param size        target RSA modulus size (in bits)
+ * \param pubexp      public exponent to use, or zero
+ * \return  1 on success, 0 on error (invalid parameters)
+ */
+uint32_t br_rsa_i15_keygen(
+	const br_prng_class **rng_ctx,
+	br_rsa_private_key *sk, unsigned char *kbuf_priv,
+	br_rsa_public_key *pk, unsigned char *kbuf_pub,
+	unsigned size, uint32_t pubexp);
+
+/**
+ * \brief RSA key pair generation with the "i31" engine.
+ *
+ * \see br_rsa_keygen
+ *
+ * \param rng_ctx     source PRNG context (already initialized)
+ * \param sk          RSA private key structure (destination)
+ * \param kbuf_priv   buffer for private key elements
+ * \param pk          RSA public key structure (destination), or `NULL`
+ * \param kbuf_pub    buffer for public key elements, or `NULL`
+ * \param size        target RSA modulus size (in bits)
+ * \param pubexp      public exponent to use, or zero
+ * \return  1 on success, 0 on error (invalid parameters)
+ */
+uint32_t br_rsa_i31_keygen(
+	const br_prng_class **rng_ctx,
+	br_rsa_private_key *sk, unsigned char *kbuf_priv,
+	br_rsa_public_key *pk, unsigned char *kbuf_pub,
+	unsigned size, uint32_t pubexp);
+
+/**
+ * \brief RSA key pair generation with the "i62" engine.
+ *
+ * This function is defined only on architecture that offer a 64x64->128
+ * opcode. Use `br_rsa_i62_keygen_get()` to dynamically obtain a pointer
+ * to that function.
+ *
+ * \see br_rsa_keygen
+ *
+ * \param rng_ctx     source PRNG context (already initialized)
+ * \param sk          RSA private key structure (destination)
+ * \param kbuf_priv   buffer for private key elements
+ * \param pk          RSA public key structure (destination), or `NULL`
+ * \param kbuf_pub    buffer for public key elements, or `NULL`
+ * \param size        target RSA modulus size (in bits)
+ * \param pubexp      public exponent to use, or zero
+ * \return  1 on success, 0 on error (invalid parameters)
+ */
+uint32_t br_rsa_i62_keygen(
+	const br_prng_class **rng_ctx,
+	br_rsa_private_key *sk, unsigned char *kbuf_priv,
+	br_rsa_public_key *pk, unsigned char *kbuf_pub,
+	unsigned size, uint32_t pubexp);
+
+/**
+ * \brief Get the RSA "i62" implementation (key pair generation),
+ * if available.
+ *
+ * \return  the implementation, or 0.
+ */
+br_rsa_keygen br_rsa_i62_keygen_get(void);
+
+/**
+ * \brief Get "default" RSA implementation (key pair generation).
+ *
+ * This returns the preferred implementation of RSA (key pair generation)
+ * on the current system.
+ *
+ * \return  the default implementation.
+ */
+br_rsa_keygen br_rsa_keygen_get_default(void);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/mk/Rules.mk b/mk/Rules.mk
index 7a2fac2..1ffc020 100644
--- a/mk/Rules.mk
+++ b/mk/Rules.mk
@@ -72,6 +72,7 @@ OBJ = \
  $(OBJDIR)$Pi15_encode$O \
  $(OBJDIR)$Pi15_fmont$O \
  $(OBJDIR)$Pi15_iszero$O \
+ $(OBJDIR)$Pi15_moddiv$O \
  $(OBJDIR)$Pi15_modpow$O \
  $(OBJDIR)$Pi15_modpow2$O \
  $(OBJDIR)$Pi15_montmul$O \
@@ -90,6 +91,7 @@ OBJ = \
  $(OBJDIR)$Pi31_encode$O \
  $(OBJDIR)$Pi31_fmont$O \
  $(OBJDIR)$Pi31_iszero$O \
+ $(OBJDIR)$Pi31_moddiv$O \
  $(OBJDIR)$Pi31_modpow$O \
  $(OBJDIR)$Pi31_modpow2$O \
  $(OBJDIR)$Pi31_montmul$O \
@@ -122,18 +124,22 @@ OBJ = \
  $(OBJDIR)$Phmac_ct$O \
  $(OBJDIR)$Phmac_drbg$O \
  $(OBJDIR)$Psysrng$O \
+ $(OBJDIR)$Prsa_default_keygen$O \
  $(OBJDIR)$Prsa_default_oaep_decrypt$O \
  $(OBJDIR)$Prsa_default_oaep_encrypt$O \
  $(OBJDIR)$Prsa_default_pkcs1_sign$O \
  $(OBJDIR)$Prsa_default_pkcs1_vrfy$O \
  $(OBJDIR)$Prsa_default_priv$O \
  $(OBJDIR)$Prsa_default_pub$O \
+ $(OBJDIR)$Prsa_i15_keygen$O \
  $(OBJDIR)$Prsa_i15_oaep_decrypt$O \
  $(OBJDIR)$Prsa_i15_oaep_encrypt$O \
  $(OBJDIR)$Prsa_i15_pkcs1_sign$O \
  $(OBJDIR)$Prsa_i15_pkcs1_vrfy$O \
  $(OBJDIR)$Prsa_i15_priv$O \
  $(OBJDIR)$Prsa_i15_pub$O \
+ $(OBJDIR)$Prsa_i31_keygen$O \
+ $(OBJDIR)$Prsa_i31_keygen_inner$O \
  $(OBJDIR)$Prsa_i31_oaep_decrypt$O \
  $(OBJDIR)$Prsa_i31_oaep_encrypt$O \
  $(OBJDIR)$Prsa_i31_pkcs1_sign$O \
@@ -146,6 +152,7 @@ OBJ = \
  $(OBJDIR)$Prsa_i32_pkcs1_vrfy$O \
  $(OBJDIR)$Prsa_i32_priv$O \
  $(OBJDIR)$Prsa_i32_pub$O \
+ $(OBJDIR)$Prsa_i62_keygen$O \
  $(OBJDIR)$Prsa_i62_oaep_decrypt$O \
  $(OBJDIR)$Prsa_i62_oaep_encrypt$O \
  $(OBJDIR)$Prsa_i62_pkcs1_sign$O \
@@ -544,6 +551,9 @@ $(OBJDIR)$Pi15_fmont$O: src$Pint$Pi15_fmont.c $(HEADERSPRIV)
 $(OBJDIR)$Pi15_iszero$O: src$Pint$Pi15_iszero.c $(HEADERSPRIV)
 	$(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Pi15_iszero$O src$Pint$Pi15_iszero.c
 
+$(OBJDIR)$Pi15_moddiv$O: src$Pint$Pi15_moddiv.c $(HEADERSPRIV)
+	$(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Pi15_moddiv$O src$Pint$Pi15_moddiv.c
+
 $(OBJDIR)$Pi15_modpow$O: src$Pint$Pi15_modpow.c $(HEADERSPRIV)
 	$(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Pi15_modpow$O src$Pint$Pi15_modpow.c
 
@@ -598,6 +608,9 @@ $(OBJDIR)$Pi31_fmont$O: src$Pint$Pi31_fmont.c $(HEADERSPRIV)
 $(OBJDIR)$Pi31_iszero$O: src$Pint$Pi31_iszero.c $(HEADERSPRIV)
 	$(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Pi31_iszero$O src$Pint$Pi31_iszero.c
 
+$(OBJDIR)$Pi31_moddiv$O: src$Pint$Pi31_moddiv.c $(HEADERSPRIV)
+	$(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Pi31_moddiv$O src$Pint$Pi31_moddiv.c
+
 $(OBJDIR)$Pi31_modpow$O: src$Pint$Pi31_modpow.c $(HEADERSPRIV)
 	$(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Pi31_modpow$O src$Pint$Pi31_modpow.c
 
@@ -694,6 +707,9 @@ $(OBJDIR)$Phmac_drbg$O: src$Prand$Phmac_drbg.c $(HEADERSPRIV)
 $(OBJDIR)$Psysrng$O: src$Prand$Psysrng.c $(HEADERSPRIV)
 	$(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Psysrng$O src$Prand$Psysrng.c
 
+$(OBJDIR)$Prsa_default_keygen$O: src$Prsa$Prsa_default_keygen.c $(HEADERSPRIV)
+	$(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Prsa_default_keygen$O src$Prsa$Prsa_default_keygen.c
+
 $(OBJDIR)$Prsa_default_oaep_decrypt$O: src$Prsa$Prsa_default_oaep_decrypt.c $(HEADERSPRIV)
 	$(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Prsa_default_oaep_decrypt$O src$Prsa$Prsa_default_oaep_decrypt.c
 
@@ -712,6 +728,9 @@ $(OBJDIR)$Prsa_default_priv$O: src$Prsa$Prsa_default_priv.c $(HEADERSPRIV)
 $(OBJDIR)$Prsa_default_pub$O: src$Prsa$Prsa_default_pub.c $(HEADERSPRIV)
 	$(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Prsa_default_pub$O src$Prsa$Prsa_default_pub.c
 
+$(OBJDIR)$Prsa_i15_keygen$O: src$Prsa$Prsa_i15_keygen.c $(HEADERSPRIV)
+	$(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Prsa_i15_keygen$O src$Prsa$Prsa_i15_keygen.c
+
 $(OBJDIR)$Prsa_i15_oaep_decrypt$O: src$Prsa$Prsa_i15_oaep_decrypt.c $(HEADERSPRIV)
 	$(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Prsa_i15_oaep_decrypt$O src$Prsa$Prsa_i15_oaep_decrypt.c
 
@@ -730,6 +749,12 @@ $(OBJDIR)$Prsa_i15_priv$O: src$Prsa$Prsa_i15_priv.c $(HEADERSPRIV)
 $(OBJDIR)$Prsa_i15_pub$O: src$Prsa$Prsa_i15_pub.c $(HEADERSPRIV)
 	$(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Prsa_i15_pub$O src$Prsa$Prsa_i15_pub.c
 
+$(OBJDIR)$Prsa_i31_keygen$O: src$Prsa$Prsa_i31_keygen.c $(HEADERSPRIV)
+	$(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Prsa_i31_keygen$O src$Prsa$Prsa_i31_keygen.c
+
+$(OBJDIR)$Prsa_i31_keygen_inner$O: src$Prsa$Prsa_i31_keygen_inner.c $(HEADERSPRIV)
+	$(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Prsa_i31_keygen_inner$O src$Prsa$Prsa_i31_keygen_inner.c
+
 $(OBJDIR)$Prsa_i31_oaep_decrypt$O: src$Prsa$Prsa_i31_oaep_decrypt.c $(HEADERSPRIV)
 	$(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Prsa_i31_oaep_decrypt$O src$Prsa$Prsa_i31_oaep_decrypt.c
 
@@ -766,6 +791,9 @@ $(OBJDIR)$Prsa_i32_priv$O: src$Prsa$Prsa_i32_priv.c $(HEADERSPRIV)
 $(OBJDIR)$Prsa_i32_pub$O: src$Prsa$Prsa_i32_pub.c $(HEADERSPRIV)
 	$(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Prsa_i32_pub$O src$Prsa$Prsa_i32_pub.c
 
+$(OBJDIR)$Prsa_i62_keygen$O: src$Prsa$Prsa_i62_keygen.c $(HEADERSPRIV)
+	$(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Prsa_i62_keygen$O src$Prsa$Prsa_i62_keygen.c
+
 $(OBJDIR)$Prsa_i62_oaep_decrypt$O: src$Prsa$Prsa_i62_oaep_decrypt.c $(HEADERSPRIV)
 	$(CC) $(CFLAGS) $(INCFLAGS) $(CCOUT)$(OBJDIR)$Prsa_i62_oaep_decrypt$O src$Prsa$Prsa_i62_oaep_decrypt.c
 
diff --git a/mk/mkrules.sh b/mk/mkrules.sh
index 44d6148..038d8c0 100755
--- a/mk/mkrules.sh
+++ b/mk/mkrules.sh
@@ -120,6 +120,7 @@ coresrc=" \
 	src/int/i15_encode.c \
 	src/int/i15_fmont.c \
 	src/int/i15_iszero.c \
+	src/int/i15_moddiv.c \
 	src/int/i15_modpow.c \
 	src/int/i15_modpow2.c \
 	src/int/i15_montmul.c \
@@ -138,6 +139,7 @@ coresrc=" \
 	src/int/i31_encode.c \
 	src/int/i31_fmont.c \
 	src/int/i31_iszero.c \
+	src/int/i31_moddiv.c \
 	src/int/i31_modpow.c \
 	src/int/i31_modpow2.c \
 	src/int/i31_montmul.c \
@@ -170,18 +172,22 @@ coresrc=" \
 	src/mac/hmac_ct.c \
 	src/rand/hmac_drbg.c \
 	src/rand/sysrng.c \
+	src/rsa/rsa_default_keygen.c \
 	src/rsa/rsa_default_oaep_decrypt.c \
 	src/rsa/rsa_default_oaep_encrypt.c \
 	src/rsa/rsa_default_pkcs1_sign.c \
 	src/rsa/rsa_default_pkcs1_vrfy.c \
 	src/rsa/rsa_default_priv.c \
 	src/rsa/rsa_default_pub.c \
+	src/rsa/rsa_i15_keygen.c \
 	src/rsa/rsa_i15_oaep_decrypt.c \
 	src/rsa/rsa_i15_oaep_encrypt.c \
 	src/rsa/rsa_i15_pkcs1_sign.c \
 	src/rsa/rsa_i15_pkcs1_vrfy.c \
 	src/rsa/rsa_i15_priv.c \
 	src/rsa/rsa_i15_pub.c \
+	src/rsa/rsa_i31_keygen.c \
+	src/rsa/rsa_i31_keygen_inner.c \
 	src/rsa/rsa_i31_oaep_decrypt.c \
 	src/rsa/rsa_i31_oaep_encrypt.c \
 	src/rsa/rsa_i31_pkcs1_sign.c \
@@ -194,6 +200,7 @@ coresrc=" \
 	src/rsa/rsa_i32_pkcs1_vrfy.c \
 	src/rsa/rsa_i32_priv.c \
 	src/rsa/rsa_i32_pub.c \
+	src/rsa/rsa_i62_keygen.c \
 	src/rsa/rsa_i62_oaep_decrypt.c \
 	src/rsa/rsa_i62_oaep_encrypt.c \
 	src/rsa/rsa_i62_pkcs1_sign.c \
diff --git a/src/inner.h b/src/inner.h
index 20211d7..c507102 100644
--- a/src/inner.h
+++ b/src/inner.h
@@ -48,10 +48,22 @@
  * already set their root keys to RSA-4096, so we should be able to
  * process such keys.
  *
- * This value MUST be a multiple of 64.
+ * This value MUST be a multiple of 64. This value MUST NOT exceed 47666
+ * (some computations in RSA key generation rely on the factor size being
+ * no more than 23833 bits). RSA key sizes beyond 3072 bits don't make a
+ * lot of sense anyway.
  */
 #define BR_MAX_RSA_SIZE   4096
 
+/*
+ * Minimum size for a RSA modulus (in bits); this value is used only to
+ * filter out invalid parameters for key pair generation. Normally,
+ * applications should not use RSA keys smaller than 2048 bits; but some
+ * specific cases might need shorter keys, for legacy or research
+ * purposes.
+ */
+#define BR_MIN_RSA_SIZE   512
+
 /*
  * Maximum size for a RSA factor (in bits). This is for RSA private-key
  * operations. Default is to support factors up to a bit more than half
@@ -1474,6 +1486,23 @@ uint32_t br_i31_modpow_opt(uint32_t *x, const unsigned char *e, size_t elen,
  */
 void br_i31_mulacc(uint32_t *d, const uint32_t *a, const uint32_t *b);
 
+/*
+ * Compute x/y mod m, result in x. Values x and y must be between 0 and
+ * m-1, and have the same announced bit length as m. Modulus m must be
+ * odd. The "m0i" parameter is equal to -1/m mod 2^31. The array 't'
+ * must point to a temporary area that can hold at least three integers
+ * of the size of m.
+ *
+ * m may not overlap x and y. x and y may overlap each other (this can
+ * be useful to test whether a value is invertible modulo m). t must be
+ * disjoint from all other arrays.
+ *
+ * Returned value is 1 on success, 0 otherwise. Success is attained if
+ * y is invertible modulo m.
+ */
+uint32_t br_i31_moddiv(uint32_t *x, const uint32_t *y,
+	const uint32_t *m, uint32_t m0i, uint32_t *t);
+
 /* ==================================================================== */
 
 /*
@@ -1528,9 +1557,36 @@ void br_i15_reduce(uint16_t *x, const uint16_t *a, const uint16_t *m);
 
 void br_i15_mulacc(uint16_t *d, const uint16_t *a, const uint16_t *b);
 
+uint32_t br_i15_moddiv(uint16_t *x, const uint16_t *y,
+	const uint16_t *m, uint16_t m0i, uint16_t *t);
+
+/*
+ * Variant of br_i31_modpow_opt() that internally uses 64x64->128
+ * multiplications. It expects the same parameters as br_i31_modpow_opt(),
+ * except that the temporaries should be 64-bit integers, not 32-bit
+ * integers.
+ */
 uint32_t br_i62_modpow_opt(uint32_t *x31, const unsigned char *e, size_t elen,
 	const uint32_t *m31, uint32_t m0i31, uint64_t *tmp, size_t twlen);
 
+/*
+ * Type for a function with the same API as br_i31_modpow_opt() (some
+ * implementations of this type may have stricter alignment requirements
+ * on the temporaries).
+ */
+typedef uint32_t (*br_i31_modpow_opt_type)(uint32_t *x,
+	const unsigned char *e, size_t elen,
+	const uint32_t *m, uint32_t m0i, uint32_t *tmp, size_t twlen);
+
+/*
+ * Wrapper for br_i62_modpow_opt() that uses the same type as
+ * br_i31_modpow_opt(); however, it requires its 'tmp' argument to the
+ * 64-bit aligned.
+ */
+uint32_t br_i62_modpow_opt_as_i31(uint32_t *x,
+	const unsigned char *e, size_t elen,
+	const uint32_t *m, uint32_t m0i, uint32_t *tmp, size_t twlen);
+
 /* ==================================================================== */
 
 static inline size_t
@@ -1912,6 +1968,15 @@ uint32_t br_rsa_oaep_unpad(const br_hash_class *dig,
 void br_mgf1_xor(void *data, size_t len,
 	const br_hash_class *dig, const void *seed, size_t seed_len);
 
+/*
+ * Inner function for RSA key generation; used by the "i31" and "i62"
+ * implementations.
+ */
+uint32_t br_rsa_i31_keygen_inner(const br_prng_class **rng,
+	br_rsa_private_key *sk, unsigned char *kbuf_priv,
+	br_rsa_public_key *pk, unsigned char *kbuf_pub,
+	unsigned size, uint32_t pubexp, br_i31_modpow_opt_type mp31);
+
 /* ==================================================================== */
 /*
  * Elliptic curves.
diff --git a/src/int/i15_moddiv.c b/src/int/i15_moddiv.c
new file mode 100644
index 0000000..45af756
--- /dev/null
+++ b/src/int/i15_moddiv.c
@@ -0,0 +1,465 @@
+/*
+ * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * In this file, we handle big integers with a custom format, i.e.
+ * without the usual one-word header. Value is split into 15-bit words,
+ * each stored in a 16-bit slot (top bit is zero) in little-endian
+ * order. The length (in words) is provided explicitly. In some cases,
+ * the value can be negative (using two's complement representation). In
+ * some cases, the top word is allowed to have a 16th bit.
+ */
+
+/*
+ * Negate big integer conditionally. The value consists of 'len' words,
+ * with 15 bits in each word (the top bit of each word should be 0,
+ * except possibly for the last word). If 'ctl' is 1, the negation is
+ * computed; otherwise, if 'ctl' is 0, then the value is unchanged.
+ */
+static void
+cond_negate(uint16_t *a, size_t len, uint32_t ctl)
+{
+	size_t k;
+	uint32_t cc, xm;
+
+	cc = ctl;
+	xm = 0x7FFF & -ctl;
+	for (k = 0; k < len; k ++) {
+		uint32_t aw;
+
+		aw = a[k];
+		aw = (aw ^ xm) + cc;
+		a[k] = aw & 0x7FFF;
+		cc = (aw >> 15) & 1;
+	}
+}
+
+/*
+ * Finish modular reduction. Rules on input parameters:
+ *
+ *   if neg = 1, then -m <= a < 0
+ *   if neg = 0, then 0 <= a < 2*m
+ *
+ * If neg = 0, then the top word of a[] may use 16 bits.
+ *
+ * Also, modulus m must be odd.
+ */
+static void
+finish_mod(uint16_t *a, size_t len, const uint16_t *m, uint32_t neg)
+{
+	size_t k;
+	uint32_t cc, xm, ym;
+
+	/*
+	 * First pass: compare a (assumed nonnegative) with m.
+	 */
+	cc = 0;
+	for (k = 0; k < len; k ++) {
+		uint32_t aw, mw;
+
+		aw = a[k];
+		mw = m[k];
+		cc = (aw - mw - cc) >> 31;
+	}
+
+	/*
+	 * At this point:
+	 *   if neg = 1, then we must add m (regardless of cc)
+	 *   if neg = 0 and cc = 0, then we must subtract m
+	 *   if neg = 0 and cc = 1, then we must do nothing
+	 */
+	xm = 0x7FFF & -neg;
+	ym = -(neg | (1 - cc));
+	cc = neg;
+	for (k = 0; k < len; k ++) {
+		uint32_t aw, mw;
+
+		aw = a[k];
+		mw = (m[k] ^ xm) & ym;
+		aw = aw - mw - cc;
+		a[k] = aw & 0x7FFF;
+		cc = aw >> 31;
+	}
+}
+
+/*
+ * Compute:
+ *   a <- (a*pa+b*pb)/(2^15)
+ *   b <- (a*qa+b*qb)/(2^15)
+ * The division is assumed to be exact (i.e. the low word is dropped).
+ * If the final a is negative, then it is negated. Similarly for b.
+ * Returned value is the combination of two bits:
+ *   bit 0: 1 if a had to be negated, 0 otherwise
+ *   bit 1: 1 if b had to be negated, 0 otherwise
+ *
+ * Factors pa, pb, qa and qb must be at most 2^15 in absolute value.
+ * Source integers a and b must be nonnegative; top word is not allowed
+ * to contain an extra 16th bit.
+ */
+static uint32_t
+co_reduce(uint16_t *a, uint16_t *b, size_t len,
+	int32_t pa, int32_t pb, int32_t qa, int32_t qb)
+{
+	size_t k;
+	int32_t cca, ccb;
+	uint32_t nega, negb;
+
+	cca = 0;
+	ccb = 0;
+	for (k = 0; k < len; k ++) {
+		uint32_t wa, wb, za, zb;
+		uint16_t tta, ttb;
+
+		/*
+		 * Since:
+		 *   |pa| <= 2^15
+		 *   |pb| <= 2^15
+		 *   0 <= wa <= 2^15 - 1
+		 *   0 <= wb <= 2^15 - 1
+		 *   |cca| <= 2^16 - 1
+		 * Then:
+		 *   |za| <= (2^15-1)*(2^16) + (2^16-1) = 2^31 - 1
+		 *
+		 * Thus, the new value of cca is such that |cca| <= 2^16 - 1.
+		 * The same applies to ccb.
+		 */
+		wa = a[k];
+		wb = b[k];
+		za = wa * (uint32_t)pa + wb * (uint32_t)pb + (uint32_t)cca;
+		zb = wa * (uint32_t)qa + wb * (uint32_t)qb + (uint32_t)ccb;
+		if (k > 0) {
+			a[k - 1] = za & 0x7FFF;
+			b[k - 1] = zb & 0x7FFF;
+		}
+		tta = za >> 15;
+		ttb = zb >> 15;
+		cca = *(int16_t *)&tta;
+		ccb = *(int16_t *)&ttb;
+	}
+	a[len - 1] = (uint16_t)cca;
+	b[len - 1] = (uint16_t)ccb;
+	nega = (uint32_t)cca >> 31;
+	negb = (uint32_t)ccb >> 31;
+	cond_negate(a, len, nega);
+	cond_negate(b, len, negb);
+	return nega | (negb << 1);
+}
+
+/*
+ * Compute:
+ *   a <- (a*pa+b*pb)/(2^15) mod m
+ *   b <- (a*qa+b*qb)/(2^15) mod m
+ *
+ * m0i is equal to -1/m[0] mod 2^15.
+ *
+ * Factors pa, pb, qa and qb must be at most 2^15 in absolute value.
+ * Source integers a and b must be nonnegative; top word is not allowed
+ * to contain an extra 16th bit.
+ */
+static void
+co_reduce_mod(uint16_t *a, uint16_t *b, size_t len,
+	int32_t pa, int32_t pb, int32_t qa, int32_t qb,
+	const uint16_t *m, uint16_t m0i)
+{
+	size_t k;
+	int32_t cca, ccb, fa, fb;
+
+	cca = 0;
+	ccb = 0;
+	fa = ((a[0] * (uint32_t)pa + b[0] * (uint32_t)pb) * m0i) & 0x7FFF;
+	fb = ((a[0] * (uint32_t)qa + b[0] * (uint32_t)qb) * m0i) & 0x7FFF;
+	for (k = 0; k < len; k ++) {
+		uint32_t wa, wb, za, zb;
+		uint32_t tta, ttb;
+
+		/*
+		 * In this loop, carries 'cca' and 'ccb' always fit on
+		 * 17 bits (in absolute value).
+		 */
+		wa = a[k];
+		wb = b[k];
+		za = wa * (uint32_t)pa + wb * (uint32_t)pb
+			+ m[k] * (uint32_t)fa + (uint32_t)cca;
+		zb = wa * (uint32_t)qa + wb * (uint32_t)qb
+			+ m[k] * (uint32_t)fb + (uint32_t)ccb;
+		if (k > 0) {
+			a[k - 1] = za & 0x7FFF;
+			b[k - 1] = zb & 0x7FFF;
+		}
+
+		/*
+		 * The XOR-and-sub construction below does an arithmetic
+		 * right shift in a portable way (technically, right-shifting
+		 * a negative signed value is implementation-defined in C).
+		 */
+#define M   ((uint32_t)1 << 16)
+		tta = za >> 15;
+		ttb = zb >> 15;
+		tta = (tta ^ M) - M;
+		ttb = (ttb ^ M) - M;
+		cca = *(int32_t *)&tta;
+		ccb = *(int32_t *)&ttb;
+#undef M
+	}
+	a[len - 1] = (uint32_t)cca;
+	b[len - 1] = (uint32_t)ccb;
+
+	/*
+	 * At this point:
+	 *   -m <= a < 2*m
+	 *   -m <= b < 2*m
+	 * (this is a case of Montgomery reduction)
+	 * The top word of 'a' and 'b' may have a 16-th bit set.
+	 * We may have to add or subtract the modulus.
+	 */
+	finish_mod(a, len, m, (uint32_t)cca >> 31);
+	finish_mod(b, len, m, (uint32_t)ccb >> 31);
+}
+
+/* see inner.h */
+uint32_t
+br_i15_moddiv(uint16_t *x, const uint16_t *y, const uint16_t *m, uint16_t m0i,
+	uint16_t *t)
+{
+	/*
+	 * Algorithm is an extended binary GCD. We maintain four values
+	 * a, b, u and v, with the following invariants:
+	 *
+	 *   a * x = y * u mod m
+	 *   b * x = y * v mod m
+	 *
+	 * Starting values are:
+	 *
+	 *   a = y
+	 *   b = m
+	 *   u = x
+	 *   v = 0
+	 *
+	 * The formal definition of the algorithm is a sequence of steps:
+	 *
+	 *   - If a is even, then a <- a/2 and u <- u/2 mod m.
+	 *   - Otherwise, if b is even, then b <- b/2 and v <- v/2 mod m.
+	 *   - Otherwise, if a > b, then a <- (a-b)/2 and u <- (u-v)/2 mod m.
+	 *   - Otherwise, b <- (b-a)/2 and v <- (v-u)/2 mod m.
+	 *
+	 * Algorithm stops when a = b. At that point, they both are equal
+	 * to GCD(y,m); the modular division succeeds if that value is 1.
+	 * The result of the modular division is then u (or v: both are
+	 * equal at that point).
+	 *
+	 * Each step makes either a or b shrink by at least one bit; hence,
+	 * if m has bit length k bits, then 2k-2 steps are sufficient.
+	 *
+	 *
+	 * Though complexity is quadratic in the size of m, the bit-by-bit
+	 * processing is not very efficient. We can speed up processing by
+	 * remarking that the decisions are taken based only on observation
+	 * of the top and low bits of a and b.
+	 *
+	 * In the loop below, at each iteration, we use the two top words
+	 * of a and b, and the low words of a and b, to compute reduction
+	 * parameters pa, pb, qa and qb such that the new values for a
+	 * and b are:
+	 *
+	 *   a' = (a*pa + b*pb) / (2^15)
+	 *   b' = (a*qa + b*qb) / (2^15)
+	 *
+	 * the division being exact.
+	 *
+	 * Since the choices are based on the top words, they may be slightly
+	 * off, requiring an optional correction: if a' < 0, then we replace
+	 * pa with -pa, and pb with -pb. The total length of a and b is
+	 * thus reduced by at least 14 bits at each iteration.
+	 *
+	 * The stopping conditions are still the same, though: when a
+	 * and b become equal, they must be both odd (since m is odd,
+	 * the GCD cannot be even), therefore the next operation is a
+	 * subtraction, and one of the values becomes 0. At that point,
+	 * nothing else happens, i.e. one value is stuck at 0, and the
+	 * other one is the GCD.
+	 */
+	size_t len, k;
+	uint16_t *a, *b, *u, *v;
+	uint32_t num, r;
+
+	len = (m[0] + 15) >> 4;
+	a = t;
+	b = a + len;
+	u = x + 1;
+	v = b + len;
+	memcpy(a, y + 1, len * sizeof *y);
+	memcpy(b, m + 1, len * sizeof *m);
+	memset(v, 0, len * sizeof *v);
+
+	/*
+	 * Loop below ensures that a and b are reduced by some bits each,
+	 * for a total of at least 14 bits.
+	 */
+	for (num = ((m[0] - (m[0] >> 4)) << 1) + 14; num >= 14; num -= 14) {
+		size_t j;
+		uint32_t c0, c1;
+		uint32_t a0, a1, b0, b1;
+		uint32_t a_hi, b_hi, a_lo, b_lo;
+		int32_t pa, pb, qa, qb;
+		int i;
+
+		/*
+		 * Extract top words of a and b. If j is the highest
+		 * index >= 1 such that a[j] != 0 or b[j] != 0, then we want
+		 * (a[j] << 15) + a[j - 1], and (b[j] << 15) + b[j - 1].
+		 * If a and b are down to one word each, then we use a[0]
+		 * and b[0].
+		 */
+		c0 = (uint32_t)-1;
+		c1 = (uint32_t)-1;
+		a0 = 0;
+		a1 = 0;
+		b0 = 0;
+		b1 = 0;
+		j = len;
+		while (j -- > 0) {
+			uint32_t aw, bw;
+
+			aw = a[j];
+			bw = b[j];
+			a0 ^= (a0 ^ aw) & c0;
+			a1 ^= (a1 ^ aw) & c1;
+			b0 ^= (b0 ^ bw) & c0;
+			b1 ^= (b1 ^ bw) & c1;
+			c1 = c0;
+			c0 &= (((aw | bw) + 0xFFFF) >> 16) - (uint32_t)1;
+		}
+
+		/*
+		 * If c1 = 0, then we grabbed two words for a and b.
+		 * If c1 != 0 but c0 = 0, then we grabbed one word. It
+		 * is not possible that c1 != 0 and c0 != 0, because that
+		 * would mean that both integers are zero.
+		 */
+		a1 |= a0 & c1;
+		a0 &= ~c1;
+		b1 |= b0 & c1;
+		b0 &= ~c1;
+		a_hi = (a0 << 15) + a1;
+		b_hi = (b0 << 15) + b1;
+		a_lo = a[0];
+		b_lo = b[0];
+
+		/*
+		 * Compute reduction factors:
+		 *
+		 *   a' = a*pa + b*pb
+		 *   b' = a*qa + b*qb
+		 *
+		 * such that a' and b' are both multiple of 2^15, but are
+		 * only marginally larger than a and b.
+		 */
+		pa = 1;
+		pb = 0;
+		qa = 0;
+		qb = 1;
+		for (i = 0; i < 15; i ++) {
+			/*
+			 * At each iteration:
+			 *
+			 *   a <- (a-b)/2 if: a is odd, b is odd, a_hi > b_hi
+			 *   b <- (b-a)/2 if: a is odd, b is odd, a_hi <= b_hi
+			 *   a <- a/2 if: a is even
+			 *   b <- b/2 if: a is odd, b is even
+			 *
+			 * We multiply a_lo and b_lo by 2 at each
+			 * iteration, thus a division by 2 really is a
+			 * non-multiplication by 2.
+			 */
+			uint32_t r, oa, ob, cAB, cBA, cA;
+
+			/*
+			 * cAB = 1 if b must be subtracted from a
+			 * cBA = 1 if a must be subtracted from b
+			 * cA = 1 if a is divided by 2, 0 otherwise
+			 *
+			 * Rules:
+			 *
+			 *   cAB and cBA cannot be both 1.
+			 *   if a is not divided by 2, b is.
+			 */
+			r = GT(a_hi, b_hi);
+			oa = (a_lo >> i) & 1;
+			ob = (b_lo >> i) & 1;
+			cAB = oa & ob & r;
+			cBA = oa & ob & NOT(r);
+			cA = cAB | NOT(oa);
+
+			/*
+			 * Conditional subtractions.
+			 */
+			a_lo -= b_lo & -cAB;
+			a_hi -= b_hi & -cAB;
+			pa -= qa & -(int32_t)cAB;
+			pb -= qb & -(int32_t)cAB;
+			b_lo -= a_lo & -cBA;
+			b_hi -= a_hi & -cBA;
+			qa -= pa & -(int32_t)cBA;
+			qb -= pb & -(int32_t)cBA;
+
+			/*
+			 * Shifting.
+			 */
+			a_lo += a_lo & (cA - 1);
+			pa += pa & ((int32_t)cA - 1);
+			pb += pb & ((int32_t)cA - 1);
+			a_hi ^= (a_hi ^ (a_hi >> 1)) & -cA;
+			b_lo += b_lo & -cA;
+			qa += qa & -(int32_t)cA;
+			qb += qb & -(int32_t)cA;
+			b_hi ^= (b_hi ^ (b_hi >> 1)) & (cA - 1);
+		}
+
+		/*
+		 * Replace a and b with new values a' and b'.
+		 */
+		r = co_reduce(a, b, len, pa, pb, qa, qb);
+		pa -= pa * ((r & 1) << 1);
+		pb -= pb * ((r & 1) << 1);
+		qa -= qa * (r & 2);
+		qb -= qb * (r & 2);
+		co_reduce_mod(u, v, len, pa, pb, qa, qb, m + 1, m0i);
+	}
+
+	/*
+	 * Now one of the arrays should be 0, and the other contains
+	 * the GCD. If a is 0, then u is 0 as well, and v contains
+	 * the division result.
+	 * Result is correct if and only if GCD is 1.
+	 */
+	r = (a[0] | b[0]) ^ 1;
+	u[0] |= v[0];
+	for (k = 1; k < len; k ++) {
+		r |= a[k] | b[k];
+		u[k] |= v[k];
+	}
+	return EQ0(r);
+}
diff --git a/src/int/i31_moddiv.c b/src/int/i31_moddiv.c
new file mode 100644
index 0000000..9950591
--- /dev/null
+++ b/src/int/i31_moddiv.c
@@ -0,0 +1,488 @@
+/*
+ * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * In this file, we handle big integers with a custom format, i.e.
+ * without the usual one-word header. Value is split into 31-bit words,
+ * each stored in a 32-bit slot (top bit is zero) in little-endian
+ * order. The length (in words) is provided explicitly. In some cases,
+ * the value can be negative (using two's complement representation). In
+ * some cases, the top word is allowed to have a 32th bit.
+ */
+
+/*
+ * Negate big integer conditionally. The value consists of 'len' words,
+ * with 31 bits in each word (the top bit of each word should be 0,
+ * except possibly for the last word). If 'ctl' is 1, the negation is
+ * computed; otherwise, if 'ctl' is 0, then the value is unchanged.
+ */
+static void
+cond_negate(uint32_t *a, size_t len, uint32_t ctl)
+{
+	size_t k;
+	uint32_t cc, xm;
+
+	cc = ctl;
+	xm = -ctl >> 1;
+	for (k = 0; k < len; k ++) {
+		uint32_t aw;
+
+		aw = a[k];
+		aw = (aw ^ xm) + cc;
+		a[k] = aw & 0x7FFFFFFF;
+		cc = aw >> 31;
+	}
+}
+
+/*
+ * Finish modular reduction. Rules on input parameters:
+ *
+ *   if neg = 1, then -m <= a < 0
+ *   if neg = 0, then 0 <= a < 2*m
+ *
+ * If neg = 0, then the top word of a[] may use 32 bits.
+ *
+ * Also, modulus m must be odd.
+ */
+static void
+finish_mod(uint32_t *a, size_t len, const uint32_t *m, uint32_t neg)
+{
+	size_t k;
+	uint32_t cc, xm, ym;
+
+	/*
+	 * First pass: compare a (assumed nonnegative) with m.
+	 * Note that if the final word uses the top extra bit, then
+	 * subtracting m must yield a value less than 2^31, since we
+	 * assumed that a < 2*m.
+	 */
+	cc = 0;
+	for (k = 0; k < len; k ++) {
+		uint32_t aw, mw;
+
+		aw = a[k];
+		mw = m[k];
+		cc = (aw - mw - cc) >> 31;
+	}
+
+	/*
+	 * At this point:
+	 *   if neg = 1, then we must add m (regardless of cc)
+	 *   if neg = 0 and cc = 0, then we must subtract m
+	 *   if neg = 0 and cc = 1, then we must do nothing
+	 */
+	xm = -neg >> 1;
+	ym = -(neg | (1 - cc));
+	cc = neg;
+	for (k = 0; k < len; k ++) {
+		uint32_t aw, mw;
+
+		aw = a[k];
+		mw = (m[k] ^ xm) & ym;
+		aw = aw - mw - cc;
+		a[k] = aw & 0x7FFFFFFF;
+		cc = aw >> 31;
+	}
+}
+
+/*
+ * Compute:
+ *   a <- (a*pa+b*pb)/(2^31)
+ *   b <- (a*qa+b*qb)/(2^31)
+ * The division is assumed to be exact (i.e. the low word is dropped).
+ * If the final a is negative, then it is negated. Similarly for b.
+ * Returned value is the combination of two bits:
+ *   bit 0: 1 if a had to be negated, 0 otherwise
+ *   bit 1: 1 if b had to be negated, 0 otherwise
+ *
+ * Factors pa, pb, qa and qb must be at most 2^31 in absolute value.
+ * Source integers a and b must be nonnegative; top word is not allowed
+ * to contain an extra 32th bit.
+ */
+static uint32_t
+co_reduce(uint32_t *a, uint32_t *b, size_t len,
+	int64_t pa, int64_t pb, int64_t qa, int64_t qb)
+{
+	size_t k;
+	int64_t cca, ccb;
+	uint32_t nega, negb;
+
+	cca = 0;
+	ccb = 0;
+	for (k = 0; k < len; k ++) {
+		uint32_t wa, wb;
+		uint64_t za, zb;
+		uint64_t tta, ttb;
+
+		/*
+		 * Since:
+		 *   |pa| <= 2^31
+		 *   |pb| <= 2^31
+		 *   0 <= wa <= 2^31 - 1
+		 *   0 <= wb <= 2^31 - 1
+		 *   |cca| <= 2^32 - 1
+		 * Then:
+		 *   |za| <= (2^31-1)*(2^32) + (2^32-1) = 2^63 - 1
+		 *
+		 * Thus, the new value of cca is such that |cca| <= 2^32 - 1.
+		 * The same applies to ccb.
+		 */
+		wa = a[k];
+		wb = b[k];
+		za = wa * (uint64_t)pa + wb * (uint64_t)pb + (uint64_t)cca;
+		zb = wa * (uint64_t)qa + wb * (uint64_t)qb + (uint64_t)ccb;
+		if (k > 0) {
+			a[k - 1] = za & 0x7FFFFFFF;
+			b[k - 1] = zb & 0x7FFFFFFF;
+		}
+
+		/*
+		 * For the new values of cca and ccb, we need a signed
+		 * right-shift; since, in C, right-shifting a signed
+		 * negative value is implementation-defined, we use a
+		 * custom portable sign extension expression.
+		 */
+#define M   ((uint64_t)1 << 32)
+		tta = za >> 31;
+		ttb = zb >> 31;
+		tta = (tta ^ M) - M;
+		ttb = (ttb ^ M) - M;
+		cca = *(int64_t *)&tta;
+		ccb = *(int64_t *)&ttb;
+#undef M
+	}
+	a[len - 1] = (uint32_t)cca;
+	b[len - 1] = (uint32_t)ccb;
+
+	nega = (uint32_t)((uint64_t)cca >> 63);
+	negb = (uint32_t)((uint64_t)ccb >> 63);
+	cond_negate(a, len, nega);
+	cond_negate(b, len, negb);
+	return nega | (negb << 1);
+}
+
+/*
+ * Compute:
+ *   a <- (a*pa+b*pb)/(2^31) mod m
+ *   b <- (a*qa+b*qb)/(2^31) mod m
+ *
+ * m0i is equal to -1/m[0] mod 2^31.
+ *
+ * Factors pa, pb, qa and qb must be at most 2^31 in absolute value.
+ * Source integers a and b must be nonnegative; top word is not allowed
+ * to contain an extra 32th bit.
+ */
+static void
+co_reduce_mod(uint32_t *a, uint32_t *b, size_t len,
+	int64_t pa, int64_t pb, int64_t qa, int64_t qb,
+	const uint32_t *m, uint32_t m0i)
+{
+	size_t k;
+	int64_t cca, ccb;
+	uint32_t fa, fb;
+
+	cca = 0;
+	ccb = 0;
+	fa = ((a[0] * (uint32_t)pa + b[0] * (uint32_t)pb) * m0i) & 0x7FFFFFFF;
+	fb = ((a[0] * (uint32_t)qa + b[0] * (uint32_t)qb) * m0i) & 0x7FFFFFFF;
+	for (k = 0; k < len; k ++) {
+		uint32_t wa, wb;
+		uint64_t za, zb;
+		uint64_t tta, ttb;
+
+		/*
+		 * In this loop, carries 'cca' and 'ccb' always fit on
+		 * 33 bits (in absolute value).
+		 */
+		wa = a[k];
+		wb = b[k];
+		za = wa * (uint64_t)pa + wb * (uint64_t)pb
+			+ m[k] * (uint64_t)fa + (uint64_t)cca;
+		zb = wa * (uint64_t)qa + wb * (uint64_t)qb
+			+ m[k] * (uint64_t)fb + (uint64_t)ccb;
+		if (k > 0) {
+			a[k - 1] = (uint32_t)za & 0x7FFFFFFF;
+			b[k - 1] = (uint32_t)zb & 0x7FFFFFFF;
+		}
+
+#define M   ((uint64_t)1 << 32)
+		tta = za >> 31;
+		ttb = zb >> 31;
+		tta = (tta ^ M) - M;
+		ttb = (ttb ^ M) - M;
+		cca = *(int64_t *)&tta;
+		ccb = *(int64_t *)&ttb;
+#undef M
+	}
+	a[len - 1] = (uint32_t)cca;
+	b[len - 1] = (uint32_t)ccb;
+
+	/*
+	 * At this point:
+	 *   -m <= a < 2*m
+	 *   -m <= b < 2*m
+	 * (this is a case of Montgomery reduction)
+	 * The top word of 'a' and 'b' may have a 32-th bit set.
+	 * We may have to add or subtract the modulus.
+	 */
+	finish_mod(a, len, m, (uint32_t)((uint64_t)cca >> 63));
+	finish_mod(b, len, m, (uint32_t)((uint64_t)ccb >> 63));
+}
+
+/* see inner.h */
+uint32_t
+br_i31_moddiv(uint32_t *x, const uint32_t *y, const uint32_t *m, uint32_t m0i,
+	uint32_t *t)
+{
+	/*
+	 * Algorithm is an extended binary GCD. We maintain four values
+	 * a, b, u and v, with the following invariants:
+	 *
+	 *   a * x = y * u mod m
+	 *   b * x = y * v mod m
+	 *
+	 * Starting values are:
+	 *
+	 *   a = y
+	 *   b = m
+	 *   u = x
+	 *   v = 0
+	 *
+	 * The formal definition of the algorithm is a sequence of steps:
+	 *
+	 *   - If a is even, then a <- a/2 and u <- u/2 mod m.
+	 *   - Otherwise, if b is even, then b <- b/2 and v <- v/2 mod m.
+	 *   - Otherwise, if a > b, then a <- (a-b)/2 and u <- (u-v)/2 mod m.
+	 *   - Otherwise, b <- (b-a)/2 and v <- (v-u)/2 mod m.
+	 *
+	 * Algorithm stops when a = b. At that point, they both are equal
+	 * to GCD(y,m); the modular division succeeds if that value is 1.
+	 * The result of the modular division is then u (or v: both are
+	 * equal at that point).
+	 *
+	 * Each step makes either a or b shrink by at least one bit; hence,
+	 * if m has bit length k bits, then 2k-2 steps are sufficient.
+	 *
+	 *
+	 * Though complexity is quadratic in the size of m, the bit-by-bit
+	 * processing is not very efficient. We can speed up processing by
+	 * remarking that the decisions are taken based only on observation
+	 * of the top and low bits of a and b.
+	 *
+	 * In the loop below, at each iteration, we use the two top words
+	 * of a and b, and the low words of a and b, to compute reduction
+	 * parameters pa, pb, qa and qb such that the new values for a
+	 * and b are:
+	 *
+	 *   a' = (a*pa + b*pb) / (2^31)
+	 *   b' = (a*qa + b*qb) / (2^31)
+	 *
+	 * the division being exact.
+	 *
+	 * Since the choices are based on the top words, they may be slightly
+	 * off, requiring an optional correction: if a' < 0, then we replace
+	 * pa with -pa, and pb with -pb. The total length of a and b is
+	 * thus reduced by at least 30 bits at each iteration.
+	 *
+	 * The stopping conditions are still the same, though: when a
+	 * and b become equal, they must be both odd (since m is odd,
+	 * the GCD cannot be even), therefore the next operation is a
+	 * subtraction, and one of the values becomes 0. At that point,
+	 * nothing else happens, i.e. one value is stuck at 0, and the
+	 * other one is the GCD.
+	 */
+	size_t len, k;
+	uint32_t *a, *b, *u, *v;
+	uint32_t num, r;
+
+	len = (m[0] + 31) >> 5;
+	a = t;
+	b = a + len;
+	u = x + 1;
+	v = b + len;
+	memcpy(a, y + 1, len * sizeof *y);
+	memcpy(b, m + 1, len * sizeof *m);
+	memset(v, 0, len * sizeof *v);
+
+	/*
+	 * Loop below ensures that a and b are reduced by some bits each,
+	 * for a total of at least 30 bits.
+	 */
+	for (num = ((m[0] - (m[0] >> 5)) << 1) + 30; num >= 30; num -= 30) {
+		size_t j;
+		uint32_t c0, c1;
+		uint32_t a0, a1, b0, b1;
+		uint64_t a_hi, b_hi;
+		uint32_t a_lo, b_lo;
+		int64_t pa, pb, qa, qb;
+		int i;
+
+		/*
+		 * Extract top words of a and b. If j is the highest
+		 * index >= 1 such that a[j] != 0 or b[j] != 0, then we want
+		 * (a[j] << 31) + a[j - 1], and (b[j] << 31) + b[j - 1].
+		 * If a and b are down to one word each, then we use a[0]
+		 * and b[0].
+		 */
+		c0 = (uint32_t)-1;
+		c1 = (uint32_t)-1;
+		a0 = 0;
+		a1 = 0;
+		b0 = 0;
+		b1 = 0;
+		j = len;
+		while (j -- > 0) {
+			uint32_t aw, bw;
+
+			aw = a[j];
+			bw = b[j];
+			a0 ^= (a0 ^ aw) & c0;
+			a1 ^= (a1 ^ aw) & c1;
+			b0 ^= (b0 ^ bw) & c0;
+			b1 ^= (b1 ^ bw) & c1;
+			c1 = c0;
+			c0 &= (((aw | bw) + 0x7FFFFFFF) >> 31) - (uint32_t)1;
+		}
+
+		/*
+		 * If c1 = 0, then we grabbed two words for a and b.
+		 * If c1 != 0 but c0 = 0, then we grabbed one word. It
+		 * is not possible that c1 != 0 and c0 != 0, because that
+		 * would mean that both integers are zero.
+		 */
+		a1 |= a0 & c1;
+		a0 &= ~c1;
+		b1 |= b0 & c1;
+		b0 &= ~c1;
+		a_hi = ((uint64_t)a0 << 31) + a1;
+		b_hi = ((uint64_t)b0 << 31) + b1;
+		a_lo = a[0];
+		b_lo = b[0];
+
+		/*
+		 * Compute reduction factors:
+		 *
+		 *   a' = a*pa + b*pb
+		 *   b' = a*qa + b*qb
+		 *
+		 * such that a' and b' are both multiple of 2^31, but are
+		 * only marginally larger than a and b.
+		 */
+		pa = 1;
+		pb = 0;
+		qa = 0;
+		qb = 1;
+		for (i = 0; i < 31; i ++) {
+			/*
+			 * At each iteration:
+			 *
+			 *   a <- (a-b)/2 if: a is odd, b is odd, a_hi > b_hi
+			 *   b <- (b-a)/2 if: a is odd, b is odd, a_hi <= b_hi
+			 *   a <- a/2 if: a is even
+			 *   b <- b/2 if: a is odd, b is even
+			 *
+			 * We multiply a_lo and b_lo by 2 at each
+			 * iteration, thus a division by 2 really is a
+			 * non-multiplication by 2.
+			 */
+			uint32_t r, oa, ob, cAB, cBA, cA;
+			uint64_t rz;
+
+			/*
+			 * r = GT(a_hi, b_hi)
+			 * But the GT() function works on uint32_t operands,
+			 * so we inline a 64-bit version here.
+			 */
+			rz = b_hi - a_hi;
+			r = (uint32_t)((rz ^ ((a_hi ^ b_hi)
+				& (a_hi ^ rz))) >> 63);
+
+			/*
+			 * cAB = 1 if b must be subtracted from a
+			 * cBA = 1 if a must be subtracted from b
+			 * cA = 1 if a is divided by 2, 0 otherwise
+			 *
+			 * Rules:
+			 *
+			 *   cAB and cBA cannot be both 1.
+			 *   if a is not divided by 2, b is.
+			 */
+			oa = (a_lo >> i) & 1;
+			ob = (b_lo >> i) & 1;
+			cAB = oa & ob & r;
+			cBA = oa & ob & NOT(r);
+			cA = cAB | NOT(oa);
+
+			/*
+			 * Conditional subtractions.
+			 */
+			a_lo -= b_lo & -cAB;
+			a_hi -= b_hi & -(uint64_t)cAB;
+			pa -= qa & -(int64_t)cAB;
+			pb -= qb & -(int64_t)cAB;
+			b_lo -= a_lo & -cBA;
+			b_hi -= a_hi & -(uint64_t)cBA;
+			qa -= pa & -(int64_t)cBA;
+			qb -= pb & -(int64_t)cBA;
+
+			/*
+			 * Shifting.
+			 */
+			a_lo += a_lo & (cA - 1);
+			pa += pa & ((int64_t)cA - 1);
+			pb += pb & ((int64_t)cA - 1);
+			a_hi ^= (a_hi ^ (a_hi >> 1)) & -(uint64_t)cA;
+			b_lo += b_lo & -cA;
+			qa += qa & -(int64_t)cA;
+			qb += qb & -(int64_t)cA;
+			b_hi ^= (b_hi ^ (b_hi >> 1)) & ((uint64_t)cA - 1);
+		}
+
+		/*
+		 * Replace a and b with new values a' and b'.
+		 */
+		r = co_reduce(a, b, len, pa, pb, qa, qb);
+		pa -= pa * ((r & 1) << 1);
+		pb -= pb * ((r & 1) << 1);
+		qa -= qa * (r & 2);
+		qb -= qb * (r & 2);
+		co_reduce_mod(u, v, len, pa, pb, qa, qb, m + 1, m0i);
+	}
+
+	/*
+	 * Now one of the arrays should be 0, and the other contains
+	 * the GCD. If a is 0, then u is 0 as well, and v contains
+	 * the division result.
+	 * Result is correct if and only if GCD is 1.
+	 */
+	r = (a[0] | b[0]) ^ 1;
+	u[0] |= v[0];
+	for (k = 1; k < len; k ++) {
+		r |= a[k] | b[k];
+		u[k] |= v[k];
+	}
+	return EQ0(r);
+}
diff --git a/src/int/i62_modpow2.c b/src/int/i62_modpow2.c
index 0414e5f..2db537f 100644
--- a/src/int/i62_modpow2.c
+++ b/src/int/i62_modpow2.c
@@ -244,7 +244,7 @@ br_i62_modpow_opt(uint32_t *x31, const unsigned char *e, size_t elen,
 	mw62num = (mw31num + 1) >> 1;
 
 	/*
-	 * In order to apply this function, we must have enough room tp
+	 * In order to apply this function, we must have enough room to
 	 * copy the operand and modulus into the temporary array, along
 	 * with at least two temporaries. If there is not enough room,
 	 * switch to br_i31_modpow(). We also use br_i31_modpow() if the
@@ -477,3 +477,17 @@ br_i62_modpow_opt(uint32_t *x31, const unsigned char *e, size_t elen,
 }
 
 #endif
+
+/* see inner.h */
+uint32_t
+br_i62_modpow_opt_as_i31(uint32_t *x31, const unsigned char *e, size_t elen,
+	const uint32_t *m31, uint32_t m0i31, uint32_t *tmp, size_t twlen)
+{
+	/*
+	 * As documented, this function expects the 'tmp' argument to be
+	 * 64-bit aligned. This is OK since this function is internal (it
+	 * is not part of BearSSL's public API).
+	 */
+	return br_i62_modpow_opt(x31, e, elen, m31, m0i31,
+		(uint64_t *)tmp, twlen >> 1);
+}
diff --git a/src/rsa/rsa_default_keygen.c b/src/rsa/rsa_default_keygen.c
new file mode 100644
index 0000000..f2e83c8
--- /dev/null
+++ b/src/rsa/rsa_default_keygen.c
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see bearssl_rsa.h */
+br_rsa_keygen
+br_rsa_keygen_get_default(void)
+{
+#if BR_INT128 || BR_UMUL128
+	return &br_rsa_i62_keygen;
+#elif BR_LOMUL
+	return &br_rsa_i15_keygen;
+#else
+	return &br_rsa_i31_keygen;
+#endif
+}
diff --git a/src/rsa/rsa_i15_keygen.c b/src/rsa/rsa_i15_keygen.c
new file mode 100644
index 0000000..0f4435f
--- /dev/null
+++ b/src/rsa/rsa_i15_keygen.c
@@ -0,0 +1,583 @@
+/*
+ * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * Make a random integer of the provided size. The size is encoded.
+ * The header word is untouched.
+ */
+static void
+mkrand(const br_prng_class **rng, uint16_t *x, uint32_t esize)
+{
+	size_t u, len;
+	unsigned m;
+
+	len = (esize + 15) >> 4;
+	(*rng)->generate(rng, x + 1, len * sizeof(uint16_t));
+	for (u = 1; u < len; u ++) {
+		x[u] &= 0x7FFF;
+	}
+	m = esize & 15;
+	if (m == 0) {
+		x[len] &= 0x7FFF;
+	} else {
+		x[len] &= 0x7FFF >> (15 - m);
+	}
+}
+
+/*
+ * This is the big-endian unsigned representation of the product of
+ * all small primes from 13 to 1481.
+ */
+static const unsigned char SMALL_PRIMES[] = {
+	0x2E, 0xAB, 0x92, 0xD1, 0x8B, 0x12, 0x47, 0x31, 0x54, 0x0A,
+	0x99, 0x5D, 0x25, 0x5E, 0xE2, 0x14, 0x96, 0x29, 0x1E, 0xB7,
+	0x78, 0x70, 0xCC, 0x1F, 0xA5, 0xAB, 0x8D, 0x72, 0x11, 0x37,
+	0xFB, 0xD8, 0x1E, 0x3F, 0x5B, 0x34, 0x30, 0x17, 0x8B, 0xE5,
+	0x26, 0x28, 0x23, 0xA1, 0x8A, 0xA4, 0x29, 0xEA, 0xFD, 0x9E,
+	0x39, 0x60, 0x8A, 0xF3, 0xB5, 0xA6, 0xEB, 0x3F, 0x02, 0xB6,
+	0x16, 0xC3, 0x96, 0x9D, 0x38, 0xB0, 0x7D, 0x82, 0x87, 0x0C,
+	0xF7, 0xBE, 0x24, 0xE5, 0x5F, 0x41, 0x04, 0x79, 0x76, 0x40,
+	0xE7, 0x00, 0x22, 0x7E, 0xB5, 0x85, 0x7F, 0x8D, 0x01, 0x50,
+	0xE9, 0xD3, 0x29, 0x42, 0x08, 0xB3, 0x51, 0x40, 0x7B, 0xD7,
+	0x8D, 0xCC, 0x10, 0x01, 0x64, 0x59, 0x28, 0xB6, 0x53, 0xF3,
+	0x50, 0x4E, 0xB1, 0xF2, 0x58, 0xCD, 0x6E, 0xF5, 0x56, 0x3E,
+	0x66, 0x2F, 0xD7, 0x07, 0x7F, 0x52, 0x4C, 0x13, 0x24, 0xDC,
+	0x8E, 0x8D, 0xCC, 0xED, 0x77, 0xC4, 0x21, 0xD2, 0xFD, 0x08,
+	0xEA, 0xD7, 0xC0, 0x5C, 0x13, 0x82, 0x81, 0x31, 0x2F, 0x2B,
+	0x08, 0xE4, 0x80, 0x04, 0x7A, 0x0C, 0x8A, 0x3C, 0xDC, 0x22,
+	0xE4, 0x5A, 0x7A, 0xB0, 0x12, 0x5E, 0x4A, 0x76, 0x94, 0x77,
+	0xC2, 0x0E, 0x92, 0xBA, 0x8A, 0xA0, 0x1F, 0x14, 0x51, 0x1E,
+	0x66, 0x6C, 0x38, 0x03, 0x6C, 0xC7, 0x4A, 0x4B, 0x70, 0x80,
+	0xAF, 0xCA, 0x84, 0x51, 0xD8, 0xD2, 0x26, 0x49, 0xF5, 0xA8,
+	0x5E, 0x35, 0x4B, 0xAC, 0xCE, 0x29, 0x92, 0x33, 0xB7, 0xA2,
+	0x69, 0x7D, 0x0C, 0xE0, 0x9C, 0xDB, 0x04, 0xD6, 0xB4, 0xBC,
+	0x39, 0xD7, 0x7F, 0x9E, 0x9D, 0x78, 0x38, 0x7F, 0x51, 0x54,
+	0x50, 0x8B, 0x9E, 0x9C, 0x03, 0x6C, 0xF5, 0x9D, 0x2C, 0x74,
+	0x57, 0xF0, 0x27, 0x2A, 0xC3, 0x47, 0xCA, 0xB9, 0xD7, 0x5C,
+	0xFF, 0xC2, 0xAC, 0x65, 0x4E, 0xBD
+};
+
+/*
+ * We need temporary values for at least 7 integers of the same size
+ * as a factor (including header word); more space helps with performance
+ * (in modular exponentiations), but we much prefer to remain under
+ * 2 kilobytes in total, to save stack space. The macro TEMPS below
+ * exceeds 1024 (which is a count in 16-bit words) when BR_MAX_RSA_SIZE
+ * is greater than 4350 (default value is 4096, so the 2-kB limit is
+ * maintained unless BR_MAX_RSA_SIZE was modified).
+ */
+#define MAX(x, y)   ((x) > (y) ? (x) : (y))
+#define TEMPS       MAX(1024, 7 * ((((BR_MAX_RSA_SIZE + 1) >> 1) + 29) / 15))
+
+/*
+ * Perform trial division on a candidate prime. This computes
+ * y = SMALL_PRIMES mod x, then tries to compute y/y mod x. The
+ * br_i15_moddiv() function will report an error if y is not invertible
+ * modulo x. Returned value is 1 on success (none of the small primes
+ * divides x), 0 on error (a non-trivial GCD is obtained).
+ *
+ * This function assumes that x is odd.
+ */
+static uint32_t
+trial_divisions(const uint16_t *x, uint16_t *t)
+{
+	uint16_t *y;
+	uint16_t x0i;
+
+	y = t;
+	t += 1 + ((x[0] + 15) >> 4);
+	x0i = br_i15_ninv15(x[1]);
+	br_i15_decode_reduce(y, SMALL_PRIMES, sizeof SMALL_PRIMES, x);
+	return br_i15_moddiv(y, y, x, x0i, t);
+}
+
+/*
+ * Perform n rounds of Miller-Rabin on the candidate prime x. This
+ * function assumes that x = 3 mod 4.
+ *
+ * Returned value is 1 on success (all rounds completed successfully),
+ * 0 otherwise.
+ */
+static uint32_t
+miller_rabin(const br_prng_class **rng, const uint16_t *x, int n,
+	uint16_t *t, size_t tlen)
+{
+	/*
+	 * Since x = 3 mod 4, the Miller-Rabin test is simple:
+	 *  - get a random base a (such that 1 < a < x-1)
+	 *  - compute z = a^((x-1)/2) mod x
+	 *  - if z != 1 and z != x-1, the number x is composite
+	 *
+	 * We generate bases 'a' randomly with a size which is
+	 * one bit less than x, which ensures that a < x-1. It
+	 * is not useful to verify that a > 1 because the probability
+	 * that we get a value a equal to 0 or 1 is much smaller
+	 * than the probability of our Miller-Rabin tests not to
+	 * detect a composite, which is already quite smaller than the
+	 * probability of the hardware misbehaving and return a
+	 * composite integer because of some glitch (e.g. bad RAM
+	 * or ill-timed cosmic ray).
+	 */
+	unsigned char *xm1d2;
+	size_t xlen, xm1d2_len, xm1d2_len_u16, u;
+	uint32_t asize;
+	unsigned cc;
+	uint16_t x0i;
+
+	/*
+	 * Compute (x-1)/2 (encoded).
+	 */
+	xm1d2 = (unsigned char *)t;
+	xm1d2_len = ((x[0] - (x[0] >> 4)) + 7) >> 3;
+	br_i15_encode(xm1d2, xm1d2_len, x);
+	cc = 0;
+	for (u = 0; u < xm1d2_len; u ++) {
+		unsigned w;
+
+		w = xm1d2[u];
+		xm1d2[u] = (unsigned char)((w >> 1) | cc);
+		cc = w << 7;
+	}
+
+	/*
+	 * We used some words of the provided buffer for (x-1)/2.
+	 */
+	xm1d2_len_u16 = (xm1d2_len + 1) >> 1;
+	t += xm1d2_len_u16;
+	tlen -= xm1d2_len_u16;
+
+	xlen = (x[0] + 15) >> 4;
+	asize = x[0] - 1 - EQ0(x[0] & 15);
+	x0i = br_i15_ninv15(x[1]);
+	while (n -- > 0) {
+		uint16_t *a;
+		uint32_t eq1, eqm1;
+
+		/*
+		 * Generate a random base. We don't need the base to be
+		 * really uniform modulo x, so we just get a random
+		 * number which is one bit shorter than x.
+		 */
+		a = t;
+		a[0] = x[0];
+		a[xlen] = 0;
+		mkrand(rng, a, asize);
+
+		/*
+		 * Compute a^((x-1)/2) mod x. We assume here that the
+		 * function will not fail (the temporary array is large
+		 * enough).
+		 */
+		br_i15_modpow_opt(a, xm1d2, xm1d2_len,
+			x, x0i, t + 1 + xlen, tlen - 1 - xlen);
+
+		/*
+		 * We must obtain either 1 or x-1. Note that x is odd,
+		 * hence x-1 differs from x only in its low word (no
+		 * carry).
+		 */
+		eq1 = a[1] ^ 1;
+		eqm1 = a[1] ^ (x[1] - 1);
+		for (u = 2; u <= xlen; u ++) {
+			eq1 |= a[u];
+			eqm1 |= a[u] ^ x[u];
+		}
+
+		if ((EQ0(eq1) | EQ0(eqm1)) == 0) {
+			return 0;
+		}
+	}
+	return 1;
+}
+
+/*
+ * Create a random prime of the provided size. 'size' is the _encoded_
+ * bit length. The two top bits and the two bottom bits are set to 1.
+ */
+static void
+mkprime(const br_prng_class **rng, uint16_t *x, uint32_t esize,
+	uint32_t pubexp, uint16_t *t, size_t tlen)
+{
+	size_t len;
+
+	x[0] = esize;
+	len = (esize + 15) >> 4;
+	for (;;) {
+		size_t u;
+		uint32_t m3, m5, m7, m11;
+		int rounds;
+
+		/*
+		 * Generate random bits. We force the two top bits and the
+		 * two bottom bits to 1.
+		 */
+		mkrand(rng, x, esize);
+		if ((esize & 15) == 0) {
+			x[len] |= 0x6000;
+		} else if ((esize & 15) == 1) {
+			x[len] |= 0x0001;
+			x[len - 1] |= 0x4000;
+		} else {
+			x[len] |= 0x0003 << ((esize & 15) - 2);
+		}
+		x[1] |= 0x0003;
+
+		/*
+		 * Trial division with low primes (3, 5, 7 and 11). We
+		 * use the following properties:
+		 *
+		 *   2^2 = 1 mod 3
+		 *   2^4 = 1 mod 5
+		 *   2^3 = 1 mod 7
+		 *   2^10 = 1 mod 11
+		 */
+		m3 = 0;
+		m5 = 0;
+		m7 = 0;
+		m11 = 0;
+		for (u = 0; u < len; u ++) {
+			uint32_t w;
+
+			w = x[1 + u];
+			m3 += w << (u & 1);
+			m3 = (m3 & 0xFF) + (m3 >> 8);
+			m5 += w << ((4 - u) & 3);
+			m5 = (m5 & 0xFF) + (m5 >> 8);
+			m7 += w;
+			m7 = (m7 & 0x1FF) + (m7 >> 9);
+			m11 += w << (5 & -(u & 1));
+			m11 = (m11 & 0x3FF) + (m11 >> 10);
+		}
+
+		/*
+		 * Maximum values of m* at this point:
+		 *  m3:   511
+		 *  m5:   2310
+		 *  m7:   510
+		 *  m11:  2047
+		 * We use the same properties to make further reductions.
+		 */
+
+		m3 = (m3 & 0x0F) + (m3 >> 4);      /* max: 46 */
+		m3 = (m3 & 0x0F) + (m3 >> 4);      /* max: 16 */
+		m3 = ((m3 * 43) >> 5) & 3;
+
+		m5 = (m5 & 0xFF) + (m5 >> 8);      /* max: 263 */
+		m5 = (m5 & 0x0F) + (m5 >> 4);      /* max: 30 */
+		m5 = (m5 & 0x0F) + (m5 >> 4);      /* max: 15 */
+		m5 -= 10 & -GT(m5, 9);
+		m5 -= 5 & -GT(m5, 4);
+
+		m7 = (m7 & 0x3F) + (m7 >> 6);      /* max: 69 */
+		m7 = (m7 & 7) + (m7 >> 3);         /* max: 14 */
+		m7 = ((m7 * 147) >> 7) & 7;
+
+		/*
+		 * 2^5 = 32 = -1 mod 11.
+		 */
+		m11 = (m11 & 0x1F) + 66 - (m11 >> 5);   /* max: 97 */
+		m11 -= 88 & -GT(m11, 87);
+		m11 -= 44 & -GT(m11, 43);
+		m11 -= 22 & -GT(m11, 21);
+		m11 -= 11 & -GT(m11, 10);
+
+		/*
+		 * If any of these modulo is 0, then the candidate is
+		 * not prime. Also, if pubexp is 3, 5, 7 or 11, and the
+		 * corresponding modulus is 1, then the candidate must
+		 * be rejected, because we need e to be invertible
+		 * modulo p-1. We can use simple comparisons here
+		 * because they won't leak information on a candidate
+		 * that we keep, only on one that we reject (and is thus
+		 * not secret).
+		 */
+		if (m3 == 0 || m5 == 0 || m7 == 0 || m11 == 0) {
+			continue;
+		}
+		if ((pubexp == 3 && m3 == 1)
+			|| (pubexp == 5 && m5 == 5)
+			|| (pubexp == 7 && m5 == 7)
+			|| (pubexp == 11 && m5 == 11))
+		{
+			continue;
+		}
+
+		/*
+		 * More trial divisions.
+		 */
+		if (!trial_divisions(x, t)) {
+			continue;
+		}
+
+		/*
+		 * Miller-Rabin algorithm. Since we selected a random
+		 * integer, not a maliciously crafted integer, we can use
+		 * relatively few rounds to lower the risk of a false
+		 * positive (i.e. declaring prime a non-prime) under
+		 * 2^(-80). It is not useful to lower the probability much
+		 * below that, since that would be substantially below
+		 * the probability of the hardware misbehaving. Sufficient
+		 * numbers of rounds are extracted from the Handbook of
+		 * Applied Cryptography, note 4.49 (page 149).
+		 *
+		 * Since we work on the encoded size (esize), we need to
+		 * compare with encoded thresholds.
+		 */
+		if (esize < 320) {
+			rounds = 12;
+		} else if (esize < 480) {
+			rounds = 9;
+		} else if (esize < 693) {
+			rounds = 6;
+		} else if (esize < 906) {
+			rounds = 4;
+		} else if (esize < 1386) {
+			rounds = 3;
+		} else {
+			rounds = 2;
+		}
+
+		if (miller_rabin(rng, x, rounds, t, tlen)) {
+			return;
+		}
+	}
+}
+
+/*
+ * Let p be a prime (p > 2^33, p = 3 mod 4). Let m = (p-1)/2, provided
+ * as parameter (with announced bit length equal to that of p). This
+ * function computes d = 1/e mod p-1 (for an odd integer e). Returned
+ * value is 1 on success, 0 on error (an error is reported if e is not
+ * invertible modulo p-1).
+ *
+ * The temporary buffer (t) must have room for at least 4 integers of
+ * the size of p.
+ */
+static uint32_t
+invert_pubexp(uint16_t *d, const uint16_t *m, uint32_t e, uint16_t *t)
+{
+	uint16_t *f;
+	uint32_t r;
+
+	f = t;
+	t += 1 + ((m[0] + 15) >> 4);
+
+	/*
+	 * Compute d = 1/e mod m. Since p = 3 mod 4, m is odd.
+	 */
+	br_i15_zero(d, m[0]);
+	d[1] = 1;
+	br_i15_zero(f, m[0]);
+	f[1] = e & 0x7FFF;
+	f[2] = (e >> 15) & 0x7FFF;
+	f[3] = e >> 30;
+	r = br_i15_moddiv(d, f, m, br_i15_ninv15(m[1]), t);
+
+	/*
+	 * We really want d = 1/e mod p-1, with p = 2m. By the CRT,
+	 * the result is either the d we got, or d + m.
+	 *
+	 * Let's write e*d = 1 + k*m, for some integer k. Integers e
+	 * and m are odd. If d is odd, then e*d is odd, which implies
+	 * that k must be even; in that case, e*d = 1 + (k/2)*2m, and
+	 * thus d is already fine. Conversely, if d is even, then k
+	 * is odd, and we must add m to d in order to get the correct
+	 * result.
+	 */
+	br_i15_add(d, m, (uint32_t)(1 - (d[1] & 1)));
+
+	return r;
+}
+
+/*
+ * Swap two buffers in RAM. They must be disjoint.
+ */
+static void
+bufswap(void *b1, void *b2, size_t len)
+{
+	size_t u;
+	unsigned char *buf1, *buf2;
+
+	buf1 = b1;
+	buf2 = b2;
+	for (u = 0; u < len; u ++) {
+		unsigned w;
+
+		w = buf1[u];
+		buf1[u] = buf2[u];
+		buf2[u] = w;
+	}
+}
+
+/* see bearssl_rsa.h */
+uint32_t
+br_rsa_i15_keygen(const br_prng_class **rng,
+	br_rsa_private_key *sk, unsigned char *kbuf_priv,
+	br_rsa_public_key *pk, unsigned char *kbuf_pub,
+	unsigned size, uint32_t pubexp)
+{
+	uint32_t esize_p, esize_q;
+	size_t plen, qlen, tlen;
+	uint16_t *p, *q, *t;
+	uint16_t tmp[TEMPS];
+	uint32_t r;
+
+	if (size < BR_MIN_RSA_SIZE || size > BR_MAX_RSA_SIZE) {
+		return 0;
+	}
+	if (pubexp == 0) {
+		pubexp = 3;
+	} else if (pubexp == 1 || (pubexp & 1) == 0) {
+		return 0;
+	}
+
+	esize_p = (size + 1) >> 1;
+	esize_q = size - esize_p;
+	sk->n_bitlen = size;
+	sk->p = kbuf_priv;
+	sk->plen = (esize_p + 7) >> 3;
+	sk->q = sk->p + sk->plen;
+	sk->qlen = (esize_q + 7) >> 3;
+	sk->dp = sk->q + sk->qlen;
+	sk->dplen = sk->plen;
+	sk->dq = sk->dp + sk->dplen;
+	sk->dqlen = sk->qlen;
+	sk->iq = sk->dq + sk->dqlen;
+	sk->iqlen = sk->plen;
+
+	if (pk != NULL) {
+		pk->n = kbuf_pub;
+		pk->nlen = (size + 7) >> 3;
+		pk->e = pk->n + pk->nlen;
+		pk->elen = 4;
+		br_enc32be(pk->e, pubexp);
+		while (*pk->e == 0) {
+			pk->e ++;
+			pk->elen --;
+		}
+	}
+
+	/*
+	 * We now switch to encoded sizes.
+	 *
+	 * floor((x * 17477) / (2^18)) is equal to floor(x/15) for all
+	 * integers x from 0 to 23833.
+	 */
+	esize_p += MUL15(esize_p, 17477) >> 18;
+	esize_q += MUL15(esize_q, 17477) >> 18;
+	plen = (esize_p + 15) >> 4;
+	qlen = (esize_q + 15) >> 4;
+	p = tmp;
+	q = p + 1 + plen;
+	t = q + 1 + qlen;
+	tlen = ((sizeof tmp) / sizeof(uint16_t)) - (2 + plen + qlen);
+
+	/*
+	 * When looking for primes p and q, we temporarily divide
+	 * candidates by 2, in order to compute the inverse of the
+	 * public exponent.
+	 */
+
+	for (;;) {
+		mkprime(rng, p, esize_p, pubexp, t, tlen);
+		br_i15_rshift(p, 1);
+		if (invert_pubexp(t, p, pubexp, t + 1 + plen)) {
+			br_i15_add(p, p, 1);
+			p[1] |= 1;
+			br_i15_encode(sk->p, sk->plen, p);
+			br_i15_encode(sk->dp, sk->dplen, t);
+			break;
+		}
+	}
+
+	for (;;) {
+		mkprime(rng, q, esize_q, pubexp, t, tlen);
+		br_i15_rshift(q, 1);
+		if (invert_pubexp(t, q, pubexp, t + 1 + qlen)) {
+			br_i15_add(q, q, 1);
+			q[1] |= 1;
+			br_i15_encode(sk->q, sk->qlen, q);
+			br_i15_encode(sk->dq, sk->dqlen, t);
+			break;
+		}
+	}
+
+	/*
+	 * If p and q have the same size, then it is possible that q > p
+	 * (when the target modulus size is odd, we generate p with a
+	 * greater bit length than q). If q > p, we want to swap p and q
+	 * (and also dp and dq) for two reasons:
+	 *  - The final step below (inversion of q modulo p) is easier if
+	 *    p > q.
+	 *  - While BearSSL's RSA code is perfectly happy with RSA keys such
+	 *    that p < q, some other implementations have restrictions and
+	 *    require p > q.
+	 *
+	 * Note that we can do a simple non-constant-time swap here,
+	 * because the only information we leak here is that we insist on
+	 * returning p and q such that p > q, which is not a secret.
+	 */
+	if (esize_p == esize_q && br_i15_sub(p, q, 0) == 1) {
+		bufswap(p, q, (1 + plen) * sizeof *p);
+		bufswap(sk->p, sk->q, sk->plen);
+		bufswap(sk->dp, sk->dq, sk->dplen);
+	}
+
+	/*
+	 * We have produced p, q, dp and dq. We can now compute iq = 1/d mod p.
+	 *
+	 * We ensured that p >= q, so this is just a matter of updating the
+	 * header word for q (and possibly adding an extra word).
+	 *
+	 * Theoretically, the call below may fail, in case we were
+	 * extraordinarily unlucky, and p = q. Another failure case is if
+	 * Miller-Rabin failed us _twice_, and p and q are non-prime and
+	 * have a factor is common. We report the error mostly because it
+	 * is cheap and we can, but in practice this never happens (or, at
+	 * least, it happens way less often than hardware glitches).
+	 */
+	q[0] = p[0];
+	if (plen > qlen) {
+		q[plen] = 0;
+		t ++;
+		tlen --;
+	}
+	br_i15_zero(t, p[0]);
+	t[1] = 1;
+	r = br_i15_moddiv(t, q, p, br_i15_ninv15(p[1]), t + 1 + plen);
+	br_i15_encode(sk->iq, sk->iqlen, t);
+
+	/*
+	 * Compute the public modulus too, if required.
+	 */
+	if (pk != NULL) {
+		br_i15_zero(t, p[0]);
+		br_i15_mulacc(t, p, q);
+		br_i15_encode(pk->n, pk->nlen, t);
+	}
+
+	return r;
+}
diff --git a/src/rsa/rsa_i31_keygen.c b/src/rsa/rsa_i31_keygen.c
new file mode 100644
index 0000000..0e26246
--- /dev/null
+++ b/src/rsa/rsa_i31_keygen.c
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see bearssl_rsa.h */
+uint32_t
+br_rsa_i31_keygen(const br_prng_class **rng,
+	br_rsa_private_key *sk, unsigned char *kbuf_priv,
+	br_rsa_public_key *pk, unsigned char *kbuf_pub,
+	unsigned size, uint32_t pubexp)
+{
+	return br_rsa_i31_keygen_inner(rng,
+		sk, kbuf_priv, pk, kbuf_pub, size, pubexp,
+		&br_i31_modpow_opt);
+}
diff --git a/src/rsa/rsa_i31_keygen_inner.c b/src/rsa/rsa_i31_keygen_inner.c
new file mode 100644
index 0000000..69120e7
--- /dev/null
+++ b/src/rsa/rsa_i31_keygen_inner.c
@@ -0,0 +1,608 @@
+/*
+ * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * Make a random integer of the provided size. The size is encoded.
+ * The header word is untouched.
+ */
+static void
+mkrand(const br_prng_class **rng, uint32_t *x, uint32_t esize)
+{
+	size_t u, len;
+	unsigned m;
+
+	len = (esize + 31) >> 5;
+	(*rng)->generate(rng, x + 1, len * sizeof(uint32_t));
+	for (u = 1; u < len; u ++) {
+		x[u] &= 0x7FFFFFFF;
+	}
+	m = esize & 31;
+	if (m == 0) {
+		x[len] &= 0x7FFFFFFF;
+	} else {
+		x[len] &= 0x7FFFFFFF >> (31 - m);
+	}
+}
+
+/*
+ * This is the big-endian unsigned representation of the product of
+ * all small primes from 13 to 1481.
+ */
+static const unsigned char SMALL_PRIMES[] = {
+	0x2E, 0xAB, 0x92, 0xD1, 0x8B, 0x12, 0x47, 0x31, 0x54, 0x0A,
+	0x99, 0x5D, 0x25, 0x5E, 0xE2, 0x14, 0x96, 0x29, 0x1E, 0xB7,
+	0x78, 0x70, 0xCC, 0x1F, 0xA5, 0xAB, 0x8D, 0x72, 0x11, 0x37,
+	0xFB, 0xD8, 0x1E, 0x3F, 0x5B, 0x34, 0x30, 0x17, 0x8B, 0xE5,
+	0x26, 0x28, 0x23, 0xA1, 0x8A, 0xA4, 0x29, 0xEA, 0xFD, 0x9E,
+	0x39, 0x60, 0x8A, 0xF3, 0xB5, 0xA6, 0xEB, 0x3F, 0x02, 0xB6,
+	0x16, 0xC3, 0x96, 0x9D, 0x38, 0xB0, 0x7D, 0x82, 0x87, 0x0C,
+	0xF7, 0xBE, 0x24, 0xE5, 0x5F, 0x41, 0x04, 0x79, 0x76, 0x40,
+	0xE7, 0x00, 0x22, 0x7E, 0xB5, 0x85, 0x7F, 0x8D, 0x01, 0x50,
+	0xE9, 0xD3, 0x29, 0x42, 0x08, 0xB3, 0x51, 0x40, 0x7B, 0xD7,
+	0x8D, 0xCC, 0x10, 0x01, 0x64, 0x59, 0x28, 0xB6, 0x53, 0xF3,
+	0x50, 0x4E, 0xB1, 0xF2, 0x58, 0xCD, 0x6E, 0xF5, 0x56, 0x3E,
+	0x66, 0x2F, 0xD7, 0x07, 0x7F, 0x52, 0x4C, 0x13, 0x24, 0xDC,
+	0x8E, 0x8D, 0xCC, 0xED, 0x77, 0xC4, 0x21, 0xD2, 0xFD, 0x08,
+	0xEA, 0xD7, 0xC0, 0x5C, 0x13, 0x82, 0x81, 0x31, 0x2F, 0x2B,
+	0x08, 0xE4, 0x80, 0x04, 0x7A, 0x0C, 0x8A, 0x3C, 0xDC, 0x22,
+	0xE4, 0x5A, 0x7A, 0xB0, 0x12, 0x5E, 0x4A, 0x76, 0x94, 0x77,
+	0xC2, 0x0E, 0x92, 0xBA, 0x8A, 0xA0, 0x1F, 0x14, 0x51, 0x1E,
+	0x66, 0x6C, 0x38, 0x03, 0x6C, 0xC7, 0x4A, 0x4B, 0x70, 0x80,
+	0xAF, 0xCA, 0x84, 0x51, 0xD8, 0xD2, 0x26, 0x49, 0xF5, 0xA8,
+	0x5E, 0x35, 0x4B, 0xAC, 0xCE, 0x29, 0x92, 0x33, 0xB7, 0xA2,
+	0x69, 0x7D, 0x0C, 0xE0, 0x9C, 0xDB, 0x04, 0xD6, 0xB4, 0xBC,
+	0x39, 0xD7, 0x7F, 0x9E, 0x9D, 0x78, 0x38, 0x7F, 0x51, 0x54,
+	0x50, 0x8B, 0x9E, 0x9C, 0x03, 0x6C, 0xF5, 0x9D, 0x2C, 0x74,
+	0x57, 0xF0, 0x27, 0x2A, 0xC3, 0x47, 0xCA, 0xB9, 0xD7, 0x5C,
+	0xFF, 0xC2, 0xAC, 0x65, 0x4E, 0xBD
+};
+
+/*
+ * We need temporary values for at least 7 integers of the same size
+ * as a factor (including header word); more space helps with performance
+ * (in modular exponentiations), but we much prefer to remain under
+ * 2 kilobytes in total, to save stack space. The macro TEMPS below
+ * exceeds 512 (which is a count in 32-bit words) when BR_MAX_RSA_SIZE
+ * is greater than 4464 (default value is 4096, so the 2-kB limit is
+ * maintained unless BR_MAX_RSA_SIZE was modified).
+ */
+#define MAX(x, y)   ((x) > (y) ? (x) : (y))
+#define ROUND2(x)   ((((x) + 1) >> 1) << 1)
+
+#define TEMPS   MAX(512, ROUND2(7 * ((((BR_MAX_RSA_SIZE + 1) >> 1) + 61) / 31)))
+
+/*
+ * Perform trial division on a candidate prime. This computes
+ * y = SMALL_PRIMES mod x, then tries to compute y/y mod x. The
+ * br_i31_moddiv() function will report an error if y is not invertible
+ * modulo x. Returned value is 1 on success (none of the small primes
+ * divides x), 0 on error (a non-trivial GCD is obtained).
+ *
+ * This function assumes that x is odd.
+ */
+static uint32_t
+trial_divisions(const uint32_t *x, uint32_t *t)
+{
+	uint32_t *y;
+	uint32_t x0i;
+
+	y = t;
+	t += 1 + ((x[0] + 31) >> 5);
+	x0i = br_i31_ninv31(x[1]);
+	br_i31_decode_reduce(y, SMALL_PRIMES, sizeof SMALL_PRIMES, x);
+	return br_i31_moddiv(y, y, x, x0i, t);
+}
+
+/*
+ * Perform n rounds of Miller-Rabin on the candidate prime x. This
+ * function assumes that x = 3 mod 4.
+ *
+ * Returned value is 1 on success (all rounds completed successfully),
+ * 0 otherwise.
+ */
+static uint32_t
+miller_rabin(const br_prng_class **rng, const uint32_t *x, int n,
+	uint32_t *t, size_t tlen, br_i31_modpow_opt_type mp31)
+{
+	/*
+	 * Since x = 3 mod 4, the Miller-Rabin test is simple:
+	 *  - get a random base a (such that 1 < a < x-1)
+	 *  - compute z = a^((x-1)/2) mod x
+	 *  - if z != 1 and z != x-1, the number x is composite
+	 *
+	 * We generate bases 'a' randomly with a size which is
+	 * one bit less than x, which ensures that a < x-1. It
+	 * is not useful to verify that a > 1 because the probability
+	 * that we get a value a equal to 0 or 1 is much smaller
+	 * than the probability of our Miller-Rabin tests not to
+	 * detect a composite, which is already quite smaller than the
+	 * probability of the hardware misbehaving and return a
+	 * composite integer because of some glitch (e.g. bad RAM
+	 * or ill-timed cosmic ray).
+	 */
+	unsigned char *xm1d2;
+	size_t xlen, xm1d2_len, xm1d2_len_u32, u;
+	uint32_t asize;
+	unsigned cc;
+	uint32_t x0i;
+
+	/*
+	 * Compute (x-1)/2 (encoded).
+	 */
+	xm1d2 = (unsigned char *)t;
+	xm1d2_len = ((x[0] - (x[0] >> 5)) + 7) >> 3;
+	br_i31_encode(xm1d2, xm1d2_len, x);
+	cc = 0;
+	for (u = 0; u < xm1d2_len; u ++) {
+		unsigned w;
+
+		w = xm1d2[u];
+		xm1d2[u] = (unsigned char)((w >> 1) | cc);
+		cc = w << 7;
+	}
+
+	/*
+	 * We used some words of the provided buffer for (x-1)/2.
+	 */
+	xm1d2_len_u32 = (xm1d2_len + 3) >> 2;
+	t += xm1d2_len_u32;
+	tlen -= xm1d2_len_u32;
+
+	xlen = (x[0] + 31) >> 5;
+	asize = x[0] - 1 - EQ0(x[0] & 31);
+	x0i = br_i31_ninv31(x[1]);
+	while (n -- > 0) {
+		uint32_t *a, *t2;
+		uint32_t eq1, eqm1;
+		size_t t2len;
+
+		/*
+		 * Generate a random base. We don't need the base to be
+		 * really uniform modulo x, so we just get a random
+		 * number which is one bit shorter than x.
+		 */
+		a = t;
+		a[0] = x[0];
+		a[xlen] = 0;
+		mkrand(rng, a, asize);
+
+		/*
+		 * Compute a^((x-1)/2) mod x. We assume here that the
+		 * function will not fail (the temporary array is large
+		 * enough).
+		 */
+		t2 = t + 1 + xlen;
+		t2len = tlen - 1 - xlen;
+		if ((t2len & 1) != 0) {
+			/*
+			 * Since the source array is 64-bit aligned and
+			 * has an even number of elements (TEMPS), we
+			 * can use the parity of the remaining length to
+			 * detect and adjust alignment.
+			 */
+			t2 ++;
+			t2len --;
+		}
+		mp31(a, xm1d2, xm1d2_len, x, x0i, t2, t2len);
+
+		/*
+		 * We must obtain either 1 or x-1. Note that x is odd,
+		 * hence x-1 differs from x only in its low word (no
+		 * carry).
+		 */
+		eq1 = a[1] ^ 1;
+		eqm1 = a[1] ^ (x[1] - 1);
+		for (u = 2; u <= xlen; u ++) {
+			eq1 |= a[u];
+			eqm1 |= a[u] ^ x[u];
+		}
+
+		if ((EQ0(eq1) | EQ0(eqm1)) == 0) {
+			return 0;
+		}
+	}
+	return 1;
+}
+
+/*
+ * Create a random prime of the provided size. 'size' is the _encoded_
+ * bit length. The two top bits and the two bottom bits are set to 1.
+ */
+static void
+mkprime(const br_prng_class **rng, uint32_t *x, uint32_t esize,
+	uint32_t pubexp, uint32_t *t, size_t tlen, br_i31_modpow_opt_type mp31)
+{
+	size_t len;
+
+	x[0] = esize;
+	len = (esize + 31) >> 5;
+	for (;;) {
+		size_t u;
+		uint32_t m3, m5, m7, m11;
+		int rounds, s7, s11;
+
+		/*
+		 * Generate random bits. We force the two top bits and the
+		 * two bottom bits to 1.
+		 */
+		mkrand(rng, x, esize);
+		if ((esize & 31) == 0) {
+			x[len] |= 0x60000000;
+		} else if ((esize & 31) == 1) {
+			x[len] |= 0x00000001;
+			x[len - 1] |= 0x40000000;
+		} else {
+			x[len] |= 0x00000003 << ((esize & 31) - 2);
+		}
+		x[1] |= 0x00000003;
+
+		/*
+		 * Trial division with low primes (3, 5, 7 and 11). We
+		 * use the following properties:
+		 *
+		 *   2^2 = 1 mod 3
+		 *   2^4 = 1 mod 5
+		 *   2^3 = 1 mod 7
+		 *   2^10 = 1 mod 11
+		 */
+		m3 = 0;
+		m5 = 0;
+		m7 = 0;
+		m11 = 0;
+		s7 = 0;
+		s11 = 0;
+		for (u = 0; u < len; u ++) {
+			uint32_t w, w3, w5, w7, w11;
+
+			w = x[1 + u];
+			w3 = (w & 0xFFFF) + (w >> 16);     /* max: 98302 */
+			w5 = (w & 0xFFFF) + (w >> 16);     /* max: 98302 */
+			w7 = (w & 0x7FFF) + (w >> 15);     /* max: 98302 */
+			w11 = (w & 0xFFFFF) + (w >> 20);   /* max: 1050622 */
+
+			m3 += w3 << (u & 1);
+			m3 = (m3 & 0xFF) + (m3 >> 8);      /* max: 1025 */
+
+			m5 += w5 << ((4 - u) & 3);
+			m5 = (m5 & 0xFFF) + (m5 >> 12);    /* max: 4479 */
+
+			m7 += w7 << s7;
+			m7 = (m7 & 0x1FF) + (m7 >> 9);     /* max: 1280 */
+			if (++ s7 == 3) {
+				s7 = 0;
+			}
+
+			m11 += w11 << s11;
+			if (++ s11 == 10) {
+				s11 = 0;
+			}
+			m11 = (m11 & 0x3FF) + (m11 >> 10); /* max: 526847 */
+		}
+
+		m3 = (m3 & 0x3F) + (m3 >> 6);      /* max: 78 */
+		m3 = (m3 & 0x0F) + (m3 >> 4);      /* max: 18 */
+		m3 = ((m3 * 43) >> 5) & 3;
+
+		m5 = (m5 & 0xFF) + (m5 >> 8);      /* max: 271 */
+		m5 = (m5 & 0x0F) + (m5 >> 4);      /* max: 31 */
+		m5 -= 20 & -GT(m5, 19);
+		m5 -= 10 & -GT(m5, 9);
+		m5 -= 5 & -GT(m5, 4);
+
+		m7 = (m7 & 0x3F) + (m7 >> 6);      /* max: 82 */
+		m7 = (m7 & 0x07) + (m7 >> 3);      /* max: 16 */
+		m7 = ((m7 * 147) >> 7) & 7;
+
+		/*
+		 * 2^5 = 32 = -1 mod 11.
+		 */
+		m11 = (m11 & 0x3FF) + (m11 >> 10);      /* max: 1536 */
+		m11 = (m11 & 0x3FF) + (m11 >> 10);      /* max: 1023 */
+		m11 = (m11 & 0x1F) + 33 - (m11 >> 5);   /* max: 64 */
+		m11 -= 44 & -GT(m11, 43);
+		m11 -= 22 & -GT(m11, 21);
+		m11 -= 11 & -GT(m11, 10);
+
+		/*
+		 * If any of these modulo is 0, then the candidate is
+		 * not prime. Also, if pubexp is 3, 5, 7 or 11, and the
+		 * corresponding modulus is 1, then the candidate must
+		 * be rejected, because we need e to be invertible
+		 * modulo p-1. We can use simple comparisons here
+		 * because they won't leak information on a candidate
+		 * that we keep, only on one that we reject (and is thus
+		 * not secret).
+		 */
+		if (m3 == 0 || m5 == 0 || m7 == 0 || m11 == 0) {
+			continue;
+		}
+		if ((pubexp == 3 && m3 == 1)
+			|| (pubexp == 5 && m5 == 5)
+			|| (pubexp == 7 && m5 == 7)
+			|| (pubexp == 11 && m5 == 11))
+		{
+			continue;
+		}
+
+		/*
+		 * More trial divisions.
+		 */
+		if (!trial_divisions(x, t)) {
+			continue;
+		}
+
+		/*
+		 * Miller-Rabin algorithm. Since we selected a random
+		 * integer, not a maliciously crafted integer, we can use
+		 * relatively few rounds to lower the risk of a false
+		 * positive (i.e. declaring prime a non-prime) under
+		 * 2^(-80). It is not useful to lower the probability much
+		 * below that, since that would be substantially below
+		 * the probability of the hardware misbehaving. Sufficient
+		 * numbers of rounds are extracted from the Handbook of
+		 * Applied Cryptography, note 4.49 (page 149).
+		 *
+		 * Since we work on the encoded size (esize), we need to
+		 * compare with encoded thresholds.
+		 */
+		if (esize < 309) {
+			rounds = 12;
+		} else if (esize < 464) {
+			rounds = 9;
+		} else if (esize < 670) {
+			rounds = 6;
+		} else if (esize < 877) {
+			rounds = 4;
+		} else if (esize < 1341) {
+			rounds = 3;
+		} else {
+			rounds = 2;
+		}
+
+		if (miller_rabin(rng, x, rounds, t, tlen, mp31)) {
+			return;
+		}
+	}
+}
+
+/*
+ * Let p be a prime (p > 2^33, p = 3 mod 4). Let m = (p-1)/2, provided
+ * as parameter (with announced bit length equal to that of p). This
+ * function computes d = 1/e mod p-1 (for an odd integer e). Returned
+ * value is 1 on success, 0 on error (an error is reported if e is not
+ * invertible modulo p-1).
+ *
+ * The temporary buffer (t) must have room for at least 4 integers of
+ * the size of p.
+ */
+static uint32_t
+invert_pubexp(uint32_t *d, const uint32_t *m, uint32_t e, uint32_t *t)
+{
+	uint32_t *f;
+	uint32_t r;
+
+	f = t;
+	t += 1 + ((m[0] + 31) >> 5);
+
+	/*
+	 * Compute d = 1/e mod m. Since p = 3 mod 4, m is odd.
+	 */
+	br_i31_zero(d, m[0]);
+	d[1] = 1;
+	br_i31_zero(f, m[0]);
+	f[1] = e & 0x7FFFFFFF;
+	f[2] = e >> 31;
+	r = br_i31_moddiv(d, f, m, br_i31_ninv31(m[1]), t);
+
+	/*
+	 * We really want d = 1/e mod p-1, with p = 2m. By the CRT,
+	 * the result is either the d we got, or d + m.
+	 *
+	 * Let's write e*d = 1 + k*m, for some integer k. Integers e
+	 * and m are odd. If d is odd, then e*d is odd, which implies
+	 * that k must be even; in that case, e*d = 1 + (k/2)*2m, and
+	 * thus d is already fine. Conversely, if d is even, then k
+	 * is odd, and we must add m to d in order to get the correct
+	 * result.
+	 */
+	br_i31_add(d, m, (uint32_t)(1 - (d[1] & 1)));
+
+	return r;
+}
+
+/*
+ * Swap two buffers in RAM. They must be disjoint.
+ */
+static void
+bufswap(void *b1, void *b2, size_t len)
+{
+	size_t u;
+	unsigned char *buf1, *buf2;
+
+	buf1 = b1;
+	buf2 = b2;
+	for (u = 0; u < len; u ++) {
+		unsigned w;
+
+		w = buf1[u];
+		buf1[u] = buf2[u];
+		buf2[u] = w;
+	}
+}
+
+/* see inner.h */
+uint32_t
+br_rsa_i31_keygen_inner(const br_prng_class **rng,
+	br_rsa_private_key *sk, unsigned char *kbuf_priv,
+	br_rsa_public_key *pk, unsigned char *kbuf_pub,
+	unsigned size, uint32_t pubexp, br_i31_modpow_opt_type mp31)
+{
+	uint32_t esize_p, esize_q;
+	size_t plen, qlen, tlen;
+	uint32_t *p, *q, *t;
+	union {
+		uint32_t t32[TEMPS];
+		uint64_t t64[TEMPS >> 1];  /* for 64-bit alignment */
+	} tmp;
+	uint32_t r;
+
+	if (size < BR_MIN_RSA_SIZE || size > BR_MAX_RSA_SIZE) {
+		return 0;
+	}
+	if (pubexp == 0) {
+		pubexp = 3;
+	} else if (pubexp == 1 || (pubexp & 1) == 0) {
+		return 0;
+	}
+
+	esize_p = (size + 1) >> 1;
+	esize_q = size - esize_p;
+	sk->n_bitlen = size;
+	sk->p = kbuf_priv;
+	sk->plen = (esize_p + 7) >> 3;
+	sk->q = sk->p + sk->plen;
+	sk->qlen = (esize_q + 7) >> 3;
+	sk->dp = sk->q + sk->qlen;
+	sk->dplen = sk->plen;
+	sk->dq = sk->dp + sk->dplen;
+	sk->dqlen = sk->qlen;
+	sk->iq = sk->dq + sk->dqlen;
+	sk->iqlen = sk->plen;
+
+	if (pk != NULL) {
+		pk->n = kbuf_pub;
+		pk->nlen = (size + 7) >> 3;
+		pk->e = pk->n + pk->nlen;
+		pk->elen = 4;
+		br_enc32be(pk->e, pubexp);
+		while (*pk->e == 0) {
+			pk->e ++;
+			pk->elen --;
+		}
+	}
+
+	/*
+	 * We now switch to encoded sizes.
+	 *
+	 * floor((x * 16913) / (2^19)) is equal to floor(x/31) for all
+	 * integers x from 0 to 34966; the intermediate product fits on
+	 * 30 bits, thus we can use MUL31().
+	 */
+	esize_p += MUL31(esize_p, 16913) >> 19;
+	esize_q += MUL31(esize_q, 16913) >> 19;
+	plen = (esize_p + 31) >> 5;
+	qlen = (esize_q + 31) >> 5;
+	p = tmp.t32;
+	q = p + 1 + plen;
+	t = q + 1 + qlen;
+	tlen = ((sizeof tmp.t32) / sizeof(uint32_t)) - (2 + plen + qlen);
+
+	/*
+	 * When looking for primes p and q, we temporarily divide
+	 * candidates by 2, in order to compute the inverse of the
+	 * public exponent.
+	 */
+
+	for (;;) {
+		mkprime(rng, p, esize_p, pubexp, t, tlen, mp31);
+		br_i31_rshift(p, 1);
+		if (invert_pubexp(t, p, pubexp, t + 1 + plen)) {
+			br_i31_add(p, p, 1);
+			p[1] |= 1;
+			br_i31_encode(sk->p, sk->plen, p);
+			br_i31_encode(sk->dp, sk->dplen, t);
+			break;
+		}
+	}
+
+	for (;;) {
+		mkprime(rng, q, esize_q, pubexp, t, tlen, mp31);
+		br_i31_rshift(q, 1);
+		if (invert_pubexp(t, q, pubexp, t + 1 + qlen)) {
+			br_i31_add(q, q, 1);
+			q[1] |= 1;
+			br_i31_encode(sk->q, sk->qlen, q);
+			br_i31_encode(sk->dq, sk->dqlen, t);
+			break;
+		}
+	}
+
+	/*
+	 * If p and q have the same size, then it is possible that q > p
+	 * (when the target modulus size is odd, we generate p with a
+	 * greater bit length than q). If q > p, we want to swap p and q
+	 * (and also dp and dq) for two reasons:
+	 *  - The final step below (inversion of q modulo p) is easier if
+	 *    p > q.
+	 *  - While BearSSL's RSA code is perfectly happy with RSA keys such
+	 *    that p < q, some other implementations have restrictions and
+	 *    require p > q.
+	 *
+	 * Note that we can do a simple non-constant-time swap here,
+	 * because the only information we leak here is that we insist on
+	 * returning p and q such that p > q, which is not a secret.
+	 */
+	if (esize_p == esize_q && br_i31_sub(p, q, 0) == 1) {
+		bufswap(p, q, (1 + plen) * sizeof *p);
+		bufswap(sk->p, sk->q, sk->plen);
+		bufswap(sk->dp, sk->dq, sk->dplen);
+	}
+
+	/*
+	 * We have produced p, q, dp and dq. We can now compute iq = 1/d mod p.
+	 *
+	 * We ensured that p >= q, so this is just a matter of updating the
+	 * header word for q (and possibly adding an extra word).
+	 *
+	 * Theoretically, the call below may fail, in case we were
+	 * extraordinarily unlucky, and p = q. Another failure case is if
+	 * Miller-Rabin failed us _twice_, and p and q are non-prime and
+	 * have a factor is common. We report the error mostly because it
+	 * is cheap and we can, but in practice this never happens (or, at
+	 * least, it happens way less often than hardware glitches).
+	 */
+	q[0] = p[0];
+	if (plen > qlen) {
+		q[plen] = 0;
+		t ++;
+		tlen --;
+	}
+	br_i31_zero(t, p[0]);
+	t[1] = 1;
+	r = br_i31_moddiv(t, q, p, br_i31_ninv31(p[1]), t + 1 + plen);
+	br_i31_encode(sk->iq, sk->iqlen, t);
+
+	/*
+	 * Compute the public modulus too, if required.
+	 */
+	if (pk != NULL) {
+		br_i31_zero(t, p[0]);
+		br_i31_mulacc(t, p, q);
+		br_i31_encode(pk->n, pk->nlen, t);
+	}
+
+	return r;
+}
diff --git a/src/rsa/rsa_i62_keygen.c b/src/rsa/rsa_i62_keygen.c
new file mode 100644
index 0000000..7d60af9
--- /dev/null
+++ b/src/rsa/rsa_i62_keygen.c
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+#if BR_INT128 || BR_UMUL128
+
+/* see bearssl_rsa.h */
+uint32_t
+br_rsa_i62_keygen(const br_prng_class **rng,
+	br_rsa_private_key *sk, unsigned char *kbuf_priv,
+	br_rsa_public_key *pk, unsigned char *kbuf_pub,
+	unsigned size, uint32_t pubexp)
+{
+	return br_rsa_i31_keygen_inner(rng,
+		sk, kbuf_priv, pk, kbuf_pub, size, pubexp,
+		&br_i62_modpow_opt_as_i31);
+}
+
+/* see bearssl_rsa.h */
+br_rsa_keygen
+br_rsa_i62_keygen_get()
+{
+	return &br_rsa_i62_keygen;
+}
+
+#else
+
+/* see bearssl_rsa.h */
+br_rsa_keygen
+br_rsa_i62_keygen_get()
+{
+	return 0;
+}
+
+#endif
diff --git a/test/test_crypto.c b/test/test_crypto.c
index acefde1..acfabd0 100644
--- a/test/test_crypto.c
+++ b/test/test_crypto.c
@@ -5685,6 +5685,122 @@ test_RSA_OAEP(const char *name,
 	fflush(stdout);
 }
 
+static void
+test_RSA_keygen(const char *name, br_rsa_keygen kg,
+	br_rsa_pkcs1_sign sign, br_rsa_pkcs1_vrfy vrfy)
+{
+	br_hmac_drbg_context rng;
+	int i;
+
+	printf("Test %s: ", name);
+	fflush(stdout);
+
+	br_hmac_drbg_init(&rng, &br_sha256_vtable, "seed for RSA keygen", 19);
+
+	for (i = 0; i < 40; i ++) {
+		unsigned size;
+		uint32_t pubexp;
+		br_rsa_private_key sk;
+		br_rsa_public_key pk;
+		unsigned char kbuf_priv[BR_RSA_KBUF_PRIV_SIZE(2048)];
+		unsigned char kbuf_pub[BR_RSA_KBUF_PUB_SIZE(2048)];
+		uint32_t mod[256];
+		uint32_t cc;
+		size_t u, v;
+		unsigned char sig[257], hv[32], hv2[sizeof hv];
+		unsigned mask1, mask2;
+
+		if (i <= 35) {
+			size = 1024 + i;
+			pubexp = 17;
+		} else {
+			size = 2048;
+			pubexp = (i << 1) - 69;
+		}
+
+		if (!kg(&rng.vtable,
+			&sk, kbuf_priv, &pk, kbuf_pub, size, pubexp))
+		{
+			fprintf(stderr, "RSA key pair generation failure\n");
+			exit(EXIT_FAILURE);
+		}
+
+		for (u = pk.elen; u > 0; u --) {
+			if (pk.e[u - 1] != (pubexp & 0xFF)) {
+				fprintf(stderr, "wrong public exponent\n");
+				exit(EXIT_FAILURE);
+			}
+			pubexp >>= 8;
+		}
+		if (pubexp != 0) {
+			fprintf(stderr, "truncated public exponent\n");
+			exit(EXIT_FAILURE);
+		}
+
+		memset(mod, 0, sizeof mod);
+		for (u = 0; u < sk.plen; u ++) {
+			for (v = 0; v < sk.qlen; v ++) {
+				mod[u + v] += (uint32_t)sk.p[sk.plen - 1 - u]
+					* (uint32_t)sk.q[sk.qlen - 1 - v];
+			}
+		}
+		cc = 0;
+		for (u = 0; u < sk.plen + sk.qlen; u ++) {
+			mod[u] += cc;
+			cc = mod[u] >> 8;
+			mod[u] &= 0xFF;
+		}
+		for (u = 0; u < pk.nlen; u ++) {
+			if (mod[pk.nlen - 1 - u] != pk.n[u]) {
+				fprintf(stderr, "wrong modulus\n");
+				exit(EXIT_FAILURE);
+			}
+		}
+		if (sk.n_bitlen != size) {
+			fprintf(stderr, "wrong key size\n");
+			exit(EXIT_FAILURE);
+		}
+		if (pk.nlen != (size + 7) >> 3) {
+			fprintf(stderr, "wrong modulus size (bytes)\n");
+			exit(EXIT_FAILURE);
+		}
+		mask1 = 0x01 << ((size + 7) & 7);
+		mask2 = 0xFF & -mask1;
+		if ((pk.n[0] & mask2) != mask1) {
+			fprintf(stderr, "wrong modulus size (bits)\n");
+			exit(EXIT_FAILURE);
+		}
+
+		rng.vtable->generate(&rng.vtable, hv, sizeof hv);
+		memset(sig, 0, sizeof sig);
+		sig[pk.nlen] = 0x00;
+		if (!sign(BR_HASH_OID_SHA256, hv, sizeof hv, &sk, sig)) {
+			fprintf(stderr, "signature error\n");
+			exit(EXIT_FAILURE);
+		}
+		if (sig[pk.nlen] != 0x00) {
+			fprintf(stderr, "signature length error\n");
+			exit(EXIT_FAILURE);
+		}
+		if (!vrfy(sig, pk.nlen, BR_HASH_OID_SHA256, sizeof hv,
+			&pk, hv2))
+		{
+			fprintf(stderr, "signature verification error (1)\n");
+			exit(EXIT_FAILURE);
+		}
+		if (memcmp(hv, hv2, sizeof hv) != 0) {
+			fprintf(stderr, "signature verification error (2)\n");
+			exit(EXIT_FAILURE);
+		}
+
+		printf(".");
+		fflush(stdout);
+	}
+
+	printf(" done.\n");
+	fflush(stdout);
+}
+
 static void
 test_RSA_i15(void)
 {
@@ -5693,6 +5809,8 @@ test_RSA_i15(void)
 		&br_rsa_i15_pkcs1_sign, &br_rsa_i15_pkcs1_vrfy);
 	test_RSA_OAEP("RSA i15 OAEP",
 		&br_rsa_i15_oaep_encrypt, &br_rsa_i15_oaep_decrypt);
+	test_RSA_keygen("RSA i15 keygen", &br_rsa_i15_keygen,
+		&br_rsa_i15_pkcs1_sign, &br_rsa_i15_pkcs1_vrfy);
 }
 
 static void
@@ -5703,6 +5821,8 @@ test_RSA_i31(void)
 		&br_rsa_i31_pkcs1_sign, &br_rsa_i31_pkcs1_vrfy);
 	test_RSA_OAEP("RSA i31 OAEP",
 		&br_rsa_i31_oaep_encrypt, &br_rsa_i31_oaep_decrypt);
+	test_RSA_keygen("RSA i31 keygen", &br_rsa_i31_keygen,
+		&br_rsa_i31_pkcs1_sign, &br_rsa_i31_pkcs1_vrfy);
 }
 
 static void
@@ -5724,6 +5844,7 @@ test_RSA_i62(void)
 	br_rsa_pkcs1_vrfy vrfy;
 	br_rsa_oaep_encrypt menc;
 	br_rsa_oaep_decrypt mdec;
+	br_rsa_keygen kgen;
 
 	pub = br_rsa_i62_public_get();
 	priv = br_rsa_i62_private_get();
@@ -5731,16 +5852,18 @@ test_RSA_i62(void)
 	vrfy = br_rsa_i62_pkcs1_vrfy_get();
 	menc = br_rsa_i62_oaep_encrypt_get();
 	mdec = br_rsa_i62_oaep_decrypt_get();
+	kgen = br_rsa_i62_keygen_get();
 	if (pub) {
-		if (!priv || !sign || !vrfy || !menc || !mdec) {
+		if (!priv || !sign || !vrfy || !menc || !mdec || !kgen) {
 			fprintf(stderr, "Inconsistent i62 availability\n");
 			exit(EXIT_FAILURE);
 		}
 		test_RSA_core("RSA i62 core", pub, priv);
 		test_RSA_sign("RSA i62 sign", priv, sign, vrfy);
 		test_RSA_OAEP("RSA i62 OAEP", menc, mdec);
+		test_RSA_keygen("RSA i62 keygen", kgen, sign, vrfy);
 	} else {
-		if (priv || sign || vrfy || menc || mdec) {
+		if (priv || sign || vrfy || menc || mdec || kgen) {
 			fprintf(stderr, "Inconsistent i62 availability\n");
 			exit(EXIT_FAILURE);
 		}
diff --git a/test/test_speed.c b/test/test_speed.c
index 3ea9b99..43d062a 100644
--- a/test/test_speed.c
+++ b/test/test_speed.c
@@ -679,11 +679,12 @@ static const br_rsa_private_key RSA_SK = {
 
 static void
 test_speed_rsa_inner(char *name,
-	br_rsa_public fpub, br_rsa_private fpriv)
+	br_rsa_public fpub, br_rsa_private fpriv, br_rsa_keygen kgen)
 {
 	unsigned char tmp[sizeof RSA_N];
 	int i;
 	long num;
+	br_hmac_drbg_context rng;
 
 	memset(tmp, 'R', sizeof tmp);
 	tmp[0] = 0;
@@ -737,27 +738,82 @@ test_speed_rsa_inner(char *name,
 		}
 		num <<= 1;
 	}
+
+	if (kgen == 0) {
+		printf("%-30s KEYGEN UNAVAILABLE\n", name);
+		fflush(stdout);
+		return;
+	}
+	br_hmac_drbg_init(&rng, &br_sha256_vtable, "RSA keygen seed", 15);
+
+	num = 10;
+	for (;;) {
+		clock_t begin, end;
+		double tt;
+		long k;
+
+		begin = clock();
+		for (k = num; k > 0; k --) {
+			br_rsa_private_key sk;
+			unsigned char kbuf[BR_RSA_KBUF_PRIV_SIZE(1024)];
+
+			kgen(&rng.vtable, &sk, kbuf, NULL, NULL, 1024, 0);
+		}
+		end = clock();
+		tt = (double)(end - begin) / CLOCKS_PER_SEC;
+		if (tt >= 10.0) {
+			printf("%-30s %8.2f kgen[1024]/s\n", name,
+				(double)num / tt);
+			fflush(stdout);
+			break;
+		}
+		num <<= 1;
+	}
+
+	num = 10;
+	for (;;) {
+		clock_t begin, end;
+		double tt;
+		long k;
+
+		begin = clock();
+		for (k = num; k > 0; k --) {
+			br_rsa_private_key sk;
+			unsigned char kbuf[BR_RSA_KBUF_PRIV_SIZE(2048)];
+
+			kgen(&rng.vtable, &sk, kbuf, NULL, NULL, 2048, 0);
+		}
+		end = clock();
+		tt = (double)(end - begin) / CLOCKS_PER_SEC;
+		if (tt >= 10.0) {
+			printf("%-30s %8.2f kgen[2048]/s\n", name,
+				(double)num / tt);
+			fflush(stdout);
+			break;
+		}
+		num <<= 1;
+	}
 }
 
 static void
 test_speed_rsa_i15(void)
 {
 	test_speed_rsa_inner("RSA i15",
-		&br_rsa_i15_public, &br_rsa_i15_private);
+		&br_rsa_i15_public, &br_rsa_i15_private, &br_rsa_i15_keygen);
 }
 
 static void
 test_speed_rsa_i31(void)
 {
 	test_speed_rsa_inner("RSA i31",
-		&br_rsa_i31_public, &br_rsa_i31_private);
+		&br_rsa_i31_public, &br_rsa_i31_private, &br_rsa_i31_keygen);
 }
 
 static void
 test_speed_rsa_i32(void)
 {
 	test_speed_rsa_inner("RSA i32",
-		&br_rsa_i32_public, &br_rsa_i32_private);
+		&br_rsa_i32_public, &br_rsa_i32_private, 0);
 }
 
 static void
@@ -765,11 +821,13 @@ test_speed_rsa_i62(void)
 {
 	br_rsa_public pub;
 	br_rsa_private priv;
+	br_rsa_keygen kgen;
 
 	pub = br_rsa_i62_public_get();
 	priv = br_rsa_i62_private_get();
+	kgen = br_rsa_i62_keygen_get();
 	if (pub) {
-		test_speed_rsa_inner("RSA i62", pub, priv);
+		test_speed_rsa_inner("RSA i62", pub, priv, kgen);
 	} else {
 		printf("%-30s UNAVAILABLE\n", "RSA i62");
 	}
diff --git a/tools/skey.c b/tools/skey.c
index a9ecb32..0f672bf 100644
--- a/tools/skey.c
+++ b/tools/skey.c
@@ -105,6 +105,90 @@ print_ec(const br_ec_private_key *sk, int print_text, int print_C)
 	}
 }
 
+static int
+parse_rsa_spec(const char *kgen_spec, unsigned *size, uint32_t *pubexp)
+{
+	const char *p;
+	char *end;
+	unsigned long ul;
+
+	p = kgen_spec;
+	if (*p != 'r' && *p != 'R') {
+		return 0;
+	}
+	p ++;
+	if (*p != 's' && *p != 'S') {
+		return 0;
+	}
+	p ++;
+	if (*p != 'a' && *p != 'A') {
+		return 0;
+	}
+	p ++;
+	if (*p == 0) {
+		*size = 2048;
+		*pubexp = 3;
+		return 1;
+	} else if (*p != ':') {
+		return 0;
+	}
+	p ++;
+	ul = strtoul(p, &end, 10);
+	if (ul < 512 || ul > 32768) {
+		return 0;
+	}
+	*size = ul;
+	p = end;
+	if (*p == 0) {
+		*pubexp = 3;
+		return 1;
+	} else if (*p != ':') {
+		return 0;
+	}
+	p ++;
+	ul = strtoul(p, &end, 10);
+	if ((ul & 1) == 0 || ul == 1 || ((ul >> 30) >> 2) != 0) {
+		return 0;
+	}
+	*pubexp = ul;
+	if (*end != 0) {
+		return 0;
+	}
+	return 1;
+}
+
+static int
+keygen_rsa(unsigned size, uint32_t pubexp, int print_text, int print_C)
+{
+	br_hmac_drbg_context rng;
+	br_prng_seeder seeder;
+	br_rsa_keygen kg;
+	br_rsa_private_key sk;
+	unsigned char *kbuf_priv;
+	uint32_t r;
+
+	seeder = br_prng_seeder_system(NULL);
+	if (seeder == 0) {
+		fprintf(stderr, "ERROR: no system source of randomness\n");
+		return 0;
+	}
+	br_hmac_drbg_init(&rng, &br_sha256_vtable, NULL, 0);
+	if (!seeder(&rng.vtable)) {
+		fprintf(stderr, "ERROR: system source of randomness failed\n");
+		return 0;
+	}
+	kbuf_priv = xmalloc(BR_RSA_KBUF_PRIV_SIZE(size));
+	kg = br_rsa_keygen_get_default();
+	r = kg(&rng.vtable, &sk, kbuf_priv, NULL, NULL, size, pubexp);
+	if (!r) {
+		fprintf(stderr, "ERROR: RSA key pair generation failed\n");
+	} else {
+		print_rsa(&sk, print_text, print_C);
+	}
+	xfree(kbuf_priv);
+	return r;
+}
+
 static int
 decode_key(const unsigned char *buf, size_t len, int print_text, int print_C)
 {
@@ -165,6 +249,14 @@ usage_skey(void)
 "   -text         print public key details (human-readable)\n");
 	fprintf(stderr,
 "   -C            print public key details (C code)\n");
+	fprintf(stderr,
+"   -gen spec     generate a new key using the provided key specification\n");
+	fprintf(stderr,
+"Key specification begins with a key type, followed by optional parameters\n");
+	fprintf(stderr,
+"that depend on the key type, separated by colon characters:\n");
+	fprintf(stderr,
+"   rsa[:size[:pubexep]]   RSA key (defaults: size = 2048, pubexp = 3)\n");
 }
 
 /* see brssl.h */
@@ -178,6 +270,7 @@ do_skey(int argc, char *argv[])
 	unsigned char *buf;
 	size_t len;
 	pem_object *pos;
+	const char *kgen_spec;
 
 	retcode = 0;
 	verbose = 1;
@@ -186,6 +279,7 @@ do_skey(int argc, char *argv[])
 	num_files = 0;
 	buf = NULL;
 	pos = NULL;
+	kgen_spec = NULL;
 	for (i = 0; i < argc; i ++) {
 		const char *arg;
 
@@ -203,13 +297,48 @@ do_skey(int argc, char *argv[])
 			print_text = 1;
 		} else if (eqstr(arg, "-C")) {
 			print_C = 1;
+		} else if (eqstr(arg, "-gen")) {
+			if (++ i >= argc) {
+				fprintf(stderr,
+					"ERROR: no argument for '-gen'\n");
+				usage_skey();
+				goto skey_exit_error;
+			}
+			if (kgen_spec != NULL) {
+				fprintf(stderr,
+					"ERROR: multiple '-gen' options\n");
+				usage_skey();
+				goto skey_exit_error;
+			}
+			kgen_spec = argv[i];
+			argv[i] = NULL;
 		} else {
 			fprintf(stderr, "ERROR: unknown option: '%s'\n", arg);
 			usage_skey();
 			goto skey_exit_error;
 		}
 	}
-	if (num_files == 0) {
+	if (kgen_spec != NULL) {
+		unsigned rsa_size;
+		uint32_t rsa_pubexp;
+
+		if (num_files != 0) {
+			fprintf(stderr,
+				"ERROR: key files provided while generating\n");
+			usage_skey();
+			goto skey_exit_error;
+		}
+
+		if (parse_rsa_spec(kgen_spec, &rsa_size, &rsa_pubexp)) {
+			keygen_rsa(rsa_size, rsa_pubexp, print_text, print_C);
+		} else {
+			fprintf(stderr,
+				"ERROR: unknown key specification: '%s'\n",
+				kgen_spec);
+			usage_skey();
+			goto skey_exit_error;
+		}
+	} else if (num_files == 0) {
 		fprintf(stderr, "ERROR: no private key provided\n");
 		usage_skey();
 		goto skey_exit_error;
-- 
2.17.1