Initial commit.
[BoarSSL] / Crypto / Poly1305.cs
1 /*
2 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining
5 * a copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 using System;
26
27 namespace Crypto {
28
29 /*
30 * This class implements Poly1305, when used with ChaCha20. The
31 * ChaCha20 instance (already set with the secret key) is passed as
32 * parameter. Instances are not thread-safe.
33 */
34
35 public sealed class Poly1305 {
36
37 /*
38 * The ChaCha20 instance to use for encryption and decryption.
39 * That instance MUST be set, and it must have been initialised
40 * with the key to use.
41 */
42 public ChaCha20 ChaCha {
43 get; set;
44 }
45
46 byte[] pkey;
47 uint[] r;
48 uint[] acc;
49 byte[] foot;
50 byte[] tmp;
51
52 /*
53 * Create a new instance. The ChaCha20 instance to use MUST be
54 * set in the 'ChaCha' property.
55 */
56 public Poly1305()
57 {
58 pkey = new byte[32];
59 r = new uint[5];
60 acc = new uint[5];
61 foot = new byte[16];
62 tmp = new byte[16];
63 }
64
65 /*
66 * Run Poly1305 and ChaCha20 on the provided elements.
67 *
68 * iv Nonce for ChaCha20, exactly 12 bytes
69 * data data to encrypt or decrypt (buffer, off + len)
70 * aad additional authenticated data (buffer, offAAD + lenAAD)
71 * tag destination for computed authentication tag
72 * encrypt true to encrypt, false to decrypt
73 */
74 public void Run(byte[] iv,
75 byte[] data, int off, int len,
76 byte[] aad, int offAAD, int lenAAD,
77 byte[] tag, bool encrypt)
78 {
79 /*
80 * Compute Poly1305 key.
81 */
82 for (int i = 0; i < pkey.Length; i ++) {
83 pkey[i] = 0;
84 }
85 ChaCha.Run(iv, 0, pkey);
86
87 /*
88 * If encrypting, ChaCha20 must run first (the MAC is
89 * computed on the ciphertext).
90 */
91 if (encrypt) {
92 ChaCha.Run(iv, 1, data, off, len);
93 }
94
95 /*
96 * Decode the 'r' value into 26-bit words, with the
97 * "clamping" operation applied.
98 */
99 r[0] = Dec32le(pkey, 0) & 0x03FFFFFF;
100 r[1] = (Dec32le(pkey, 3) >> 2) & 0x03FFFF03;
101 r[2] = (Dec32le(pkey, 6) >> 4) & 0x03FFC0FF;
102 r[3] = (Dec32le(pkey, 9) >> 6) & 0x03F03FFF;
103 r[4] = (Dec32le(pkey, 12) >> 8) & 0x000FFFFF;
104
105 /*
106 * Accumulator is 0.
107 */
108 acc[0] = 0;
109 acc[1] = 0;
110 acc[2] = 0;
111 acc[3] = 0;
112 acc[4] = 0;
113
114 /*
115 * Process AAD, ciphertext and footer.
116 */
117 Enc32le(lenAAD, foot, 0);
118 Enc32le(0, foot, 4);
119 Enc32le(len, foot, 8);
120 Enc32le(0, foot, 12);
121 RunInner(aad, offAAD, lenAAD);
122 RunInner(data, off, len);
123 RunInner(foot, 0, 16);
124
125 /*
126 * Finalize modular reduction. The output of RunInner() is
127 * already mostly reduced: only acc[1] may be (very slightly)
128 * above 2^26. Thus, we only need one loop, back to acc[1].
129 */
130 uint cc = 0;
131 for (int i = 1; i <= 6; i ++) {
132 int j;
133
134 j = (i >= 5) ? i - 5 : i;
135 acc[j] += cc;
136 cc = acc[j] >> 26;
137 acc[j] &= 0x03FFFFFF;
138 }
139
140 /*
141 * The final value may still be in the 2^130-5..2^130-1
142 * range, in which case an additional subtraction must be
143 * performed, with constant-time code.
144 */
145 cc = (uint)((int)(0x03FFFFFA - acc[0]) >> 31);
146 for (int i = 1; i < 5; i ++) {
147 int z = (int)(acc[i] - 0x03FFFFFF);
148 cc &= ~(uint)((z | -z) >> 31);
149 }
150 cc &= 5;
151 for (int i = 0; i < 5; i ++) {
152 uint t = acc[i] + cc;
153 cc = t >> 26;
154 acc[i] = t & 0x03FFFFFF;
155 }
156
157 /*
158 * The tag is the sum of the 's' value (second half of
159 * the pkey[] array, little-endian encoding) and the
160 * current accumulator value. This addition is done modulo
161 * 2^128, i.e. with a simple truncation.
162 */
163 cc = 0;
164 uint aw = 0;
165 int awLen = 0;
166 for (int i = 0, j = 0; i < 16; i ++) {
167 if (awLen < 8) {
168 /*
169 * We "refill" our running byte buffer with
170 * a new extra accumulator word. Note that
171 * 'awLen' is always even, so at this point
172 * it must be 6 or less; since accumulator
173 * words fit on 32 bits, the operation
174 * below does not lose any bit.
175 */
176 aw |= acc[j ++] << awLen;
177 awLen += 26;
178 }
179 uint tb = (aw & 0xFF) + cc + pkey[16 + i];
180 aw >>= 8;
181 awLen -= 8;
182 tag[i] = (byte)tb;
183 cc = tb >> 8;
184 }
185
186 /*
187 * If decrypting, then we still have the ciphertext at
188 * this point, and we must perform the decryption.
189 */
190 if (!encrypt) {
191 ChaCha.Run(iv, 1, data, off, len);
192 }
193 }
194
195 /*
196 * Inner processing of the provided data. The accumulator and 'r'
197 * value are set in the instance fields, and must be updated.
198 * All accumulator words fit on 26 bits each, except the second
199 * (acc[1]) which may be very slightly above 2^26.
200 */
201 void RunInner(byte[] data, int off, int len)
202 {
203 /*
204 * Implementation is inspired from the public-domain code
205 * available there:
206 * https://github.com/floodyberry/poly1305-donna
207 */
208 uint r0 = r[0];
209 uint r1 = r[1];
210 uint r2 = r[2];
211 uint r3 = r[3];
212 uint r4 = r[4];
213
214 uint u1 = r1 * 5;
215 uint u2 = r2 * 5;
216 uint u3 = r3 * 5;
217 uint u4 = r4 * 5;
218
219 uint a0 = acc[0];
220 uint a1 = acc[1];
221 uint a2 = acc[2];
222 uint a3 = acc[3];
223 uint a4 = acc[4];
224
225 while (len > 0) {
226 if (len < 16) {
227 Array.Copy(data, off, tmp, 0, len);
228 for (int i = len; i < 16; i ++) {
229 tmp[i] = 0;
230 }
231 data = tmp;
232 off = 0;
233 }
234
235 /*
236 * Decode next block, with the "high bit" applied,
237 * and add that value to the accumulator.
238 */
239 a0 += Dec32le(data, off) & 0x03FFFFFF;
240 a1 += (Dec32le(data, off + 3) >> 2) & 0x03FFFFFF;
241 a2 += (Dec32le(data, off + 6) >> 4) & 0x03FFFFFF;
242 a3 += (Dec32le(data, off + 9) >> 6) & 0x03FFFFFF;
243 a4 += (Dec32le(data, off + 12) >> 8) | 0x01000000;
244
245 /*
246 * Compute multiplication. All elementary
247 * multiplications are 32x32->64.
248 */
249 ulong w0 = (ulong)a0 * r0
250 + (ulong)a1 * u4
251 + (ulong)a2 * u3
252 + (ulong)a3 * u2
253 + (ulong)a4 * u1;
254 ulong w1 = (ulong)a0 * r1
255 + (ulong)a1 * r0
256 + (ulong)a2 * u4
257 + (ulong)a3 * u3
258 + (ulong)a4 * u2;
259 ulong w2 = (ulong)a0 * r2
260 + (ulong)a1 * r1
261 + (ulong)a2 * r0
262 + (ulong)a3 * u4
263 + (ulong)a4 * u3;
264 ulong w3 = (ulong)a0 * r3
265 + (ulong)a1 * r2
266 + (ulong)a2 * r1
267 + (ulong)a3 * r0
268 + (ulong)a4 * u4;
269 ulong w4 = (ulong)a0 * r4
270 + (ulong)a1 * r3
271 + (ulong)a2 * r2
272 + (ulong)a3 * r1
273 + (ulong)a4 * r0;
274
275 /*
276 * Most of the modular reduction was done by using
277 * the 'u*' multipliers. We still need to do some
278 * carry propagation.
279 */
280 ulong c;
281 c = w0 >> 26;
282 a0 = (uint)w0 & 0x03FFFFFF;
283 w1 += c;
284 c = w1 >> 26;
285 a1 = (uint)w1 & 0x03FFFFFF;
286 w2 += c;
287 c = w2 >> 26;
288 a2 = (uint)w2 & 0x03FFFFFF;
289 w3 += c;
290 c = w3 >> 26;
291 a3 = (uint)w3 & 0x03FFFFFF;
292 w4 += c;
293 c = w4 >> 26;
294 a4 = (uint)w4 & 0x03FFFFFF;
295 a0 += (uint)c * 5;
296 a1 += a0 >> 26;
297 a0 &= 0x03FFFFFF;
298
299 off += 16;
300 len -= 16;
301 }
302
303 acc[0] = a0;
304 acc[1] = a1;
305 acc[2] = a2;
306 acc[3] = a3;
307 acc[4] = a4;
308 }
309
310 static uint Dec32le(byte[] buf, int off)
311 {
312 return (uint)buf[off]
313 | ((uint)buf[off + 1] << 8)
314 | ((uint)buf[off + 2] << 16)
315 | ((uint)buf[off + 3] << 24);
316 }
317
318 static void Enc32le(int x, byte[] buf, int off)
319 {
320 buf[off] = (byte)x;
321 buf[off + 1] = (byte)(x >> 8);
322 buf[off + 2] = (byte)(x >> 16);
323 buf[off + 3] = (byte)(x >> 24);
324 }
325 }
326
327 }