Botan  2.13.0
Crypto and TLS for C++11
aes_vperm.cpp
Go to the documentation of this file.
1 /*
2 * AES using vector permutes (SSSE3, NEON)
3 * (C) 2010,2016,2019 Jack Lloyd
4 *
5 * Based on public domain x86-64 assembly written by Mike Hamburg,
6 * described in "Accelerating AES with Vector Permute Instructions"
7 * (CHES 2009). His original code is available at
8 * https://crypto.stanford.edu/vpaes/
9 *
10 * Botan is released under the Simplified BSD License (see license.txt)
11 */
12 
13 #include <botan/aes.h>
14 #include <botan/internal/ct_utils.h>
15 #include <botan/internal/simd_32.h>
16 
17 #if defined(BOTAN_SIMD_USE_SSE2)
18  #include <tmmintrin.h>
19 #endif
20 
21 namespace Botan {
22 
23 namespace {
24 
// Byte-wise table lookup: builds a vector whose bytes are picked out of `a`
// (the table) according to the bytes of `b` (the indices). One backend is
// selected at compile time; a build with none of SSE2/NEON/Altivec enabled
// is rejected with #error.
25 inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) shuffle(SIMD_4x32 a, SIMD_4x32 b)
26  {
27 #if defined(BOTAN_SIMD_USE_SSE2)
// x86 path: a single byte-shuffle intrinsic (declared in <tmmintrin.h>)
28  return SIMD_4x32(_mm_shuffle_epi8(a.raw(), b.raw()));
29 #elif defined(BOTAN_SIMD_USE_NEON)
// NEON path: reinterpret both operands as 16 x u8 and use the table-lookup intrinsics
30  const uint8x16_t tbl = vreinterpretq_u8_u32(a.raw());
31  const uint8x16_t idx = vreinterpretq_u8_u32(b.raw());
32 
33 #if defined(BOTAN_TARGET_ARCH_IS_ARM32)
// 32-bit ARM: look up each 8-byte half of idx in the two-register table, then rejoin the halves
34  const uint8x8x2_t tbl2 = { vget_low_u8(tbl), vget_high_u8(tbl) };
35 
36  return SIMD_4x32(vreinterpretq_u32_u8(
37  vcombine_u8(vtbl2_u8(tbl2, vget_low_u8(idx)),
38  vtbl2_u8(tbl2, vget_high_u8(idx)))));
39 
40 #else
// Other ARM targets: a single full-width lookup
41  return SIMD_4x32(vreinterpretq_u32_u8(vqtbl1q_u8(tbl, idx)));
42 #endif
43 
44 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
45  __vector unsigned char bv = (__vector unsigned char)b.raw();
46 
// high_bit is (index >> 7) << 4 per byte: 16 where the index has its top bit set, else 0
47  const auto high_bit = vec_sl(vec_sr(bv, vec_splat_u8(7)), vec_splat_u8(4));
// Reduce each index to its low nibble, then add 16 for indices that had the top bit set
48  bv = vec_and(bv, vec_splat_u8(0x0F));
49  bv = vec_add(bv, high_bit);
50 
// NOTE(review): indices 16..31 presumably select from the zero vector here, so that
// top-bit-set indices yield 0 as on the SSE path - confirm against vec_perm semantics
51  const __vector unsigned int zero = vec_splat_u32(0);
52  return SIMD_4x32(vec_perm(a.raw(), zero, bv));
53 #else
54  #error "No shuffle implementation available"
55 #endif
56  }
57 
// Combines `a` and `b` into one 16-byte vector using an 8-byte offset.
// The Altivec branch spells the selection out: bytes 8..23 of the pair (b, a).
// NOTE(review): presumably that is the upper 8 bytes of b followed by the lower
// 8 bytes of a on every backend - confirm per-platform byte ordering.
// The only caller in this file is AES_192::vperm_key_schedule.
58 inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) alignr8(SIMD_4x32 a, SIMD_4x32 b)
59  {
60 #if defined(BOTAN_SIMD_USE_SSE2)
// x86: byte-granular align with an offset of 8 bytes
61  return SIMD_4x32(_mm_alignr_epi8(a.raw(), b.raw(), 8));
62 #elif defined(BOTAN_SIMD_USE_NEON)
// NEON: extract starting at 32-bit lane 2; note the operands are passed as (b, a)
63  return SIMD_4x32(vextq_u32(b.raw(), a.raw(), 2));
64 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
// Altivec: explicit permute mask over (b, a)
65  const __vector unsigned char mask = {8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23};
66  return SIMD_4x32(vec_perm(b.raw(), a.raw(), mask));
67 #else
68  #error "No alignr8 implementation available"
69 #endif
70  }
71 
// Tables looked up (via shuffle) with the low nibbles (k_ipt1) and high nibbles
// (k_ipt2) of each byte; used by aes_enc_first_round() and to transform the raw
// key at the start of each key schedule.
72 const SIMD_4x32 k_ipt1 = SIMD_4x32(0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090);
73 const SIMD_4x32 k_ipt2 = SIMD_4x32(0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC);
74 
// Nibble-indexed tables behind the shared t2/t5/t6 computation that appears in
// every non-first round function and in aes_schedule_round().
// NOTE(review): presumably the field-inversion tables of the vector-permute
// S-box described in the paper cited in the file header - confirm.
75 const SIMD_4x32 k_inv1 = SIMD_4x32(0x0D080180, 0x0E05060F, 0x0A0B0C02, 0x04070309);
76 const SIMD_4x32 k_inv2 = SIMD_4x32(0x0F0B0780, 0x01040A06, 0x02050809, 0x030D0E0C);
77 
// Output tables indexed by t5 (sb1u) and t6 (sb1t); used by aes_enc_round()
// and aes_schedule_round().
78 const SIMD_4x32 sb1u = SIMD_4x32(0xCB503E00, 0xB19BE18F, 0x142AF544, 0xA5DF7A6E);
79 const SIMD_4x32 sb1t = SIMD_4x32(0xFAE22300, 0x3618D415, 0x0D2ED9EF, 0x3BF7CCC1);
80 
// Byte permutations selected with (r % 4) in aes_enc_round(). Each entry is the
// previous one with its four 32-bit words rotated by one position.
81 const SIMD_4x32 mc_forward[4] = {
82  SIMD_4x32(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D),
83  SIMD_4x32(0x04070605, 0x080B0A09, 0x0C0F0E0D, 0x00030201),
84  SIMD_4x32(0x080B0A09, 0x0C0F0E0D, 0x00030201, 0x04070605),
85  SIMD_4x32(0x0C0F0E0D, 0x00030201, 0x04070605, 0x080B0A09)
86 };
87 
// Byte permutations selected with (round % 4); applied as the final shuffle of
// the last-round functions and of the key-schedule mangle helpers. Entry 0
// lists the byte indices in ascending order 0x00..0x0F.
88 const SIMD_4x32 vperm_sr[4] = {
89  SIMD_4x32(0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C),
90  SIMD_4x32(0x0F0A0500, 0x030E0904, 0x07020D08, 0x0B06010C),
91  SIMD_4x32(0x0B020900, 0x0F060D04, 0x030A0108, 0x070E050C),
92  SIMD_4x32(0x070A0D00, 0x0B0E0104, 0x0F020508, 0x0306090C),
93 };
94 
// Per-round constants consumed by the key schedules; the three-argument
// aes_schedule_round() XORs the selected entry into its second input.
// Only the first word of each entry is non-zero.
95 const SIMD_4x32 rcon[10] = {
96  SIMD_4x32(0x00000070, 0x00000000, 0x00000000, 0x00000000),
97  SIMD_4x32(0x0000002A, 0x00000000, 0x00000000, 0x00000000),
98  SIMD_4x32(0x00000098, 0x00000000, 0x00000000, 0x00000000),
99  SIMD_4x32(0x00000008, 0x00000000, 0x00000000, 0x00000000),
100  SIMD_4x32(0x0000004D, 0x00000000, 0x00000000, 0x00000000),
101  SIMD_4x32(0x0000007C, 0x00000000, 0x00000000, 0x00000000),
102  SIMD_4x32(0x0000007D, 0x00000000, 0x00000000, 0x00000000),
103  SIMD_4x32(0x00000081, 0x00000000, 0x00000000, 0x00000000),
104  SIMD_4x32(0x0000001F, 0x00000000, 0x00000000, 0x00000000),
105  SIMD_4x32(0x00000083, 0x00000000, 0x00000000, 0x00000000),
106 };
107 
108 inline SIMD_4x32 low_nibs(SIMD_4x32 x)
109  {
110  const SIMD_4x32 lo_nibs_mask = SIMD_4x32::splat_u8(0x0F);
111  return lo_nibs_mask & x;
112  }
113 
114 inline SIMD_4x32 high_nibs(SIMD_4x32 x)
115  {
116  const SIMD_4x32 hi_nibs_mask = SIMD_4x32::splat_u8(0xF0);
117  return (hi_nibs_mask & x).shr<4>();
118  }
119 
120 inline SIMD_4x32 aes_enc_first_round(SIMD_4x32 B, SIMD_4x32 K)
121  {
122  return shuffle(k_ipt1, low_nibs(B)) ^ shuffle(k_ipt2, high_nibs(B)) ^ K;
123  }
124 
125 inline SIMD_4x32 aes_enc_round(SIMD_4x32 B, SIMD_4x32 K, size_t r)
126  {
127  const SIMD_4x32 sb2u = SIMD_4x32(0x0B712400, 0xE27A93C6, 0xBC982FCD, 0x5EB7E955);
128  const SIMD_4x32 sb2t = SIMD_4x32(0x0AE12900, 0x69EB8840, 0xAB82234A, 0xC2A163C8);
129 
130  const SIMD_4x32 mc_backward[4] = {
131  SIMD_4x32(0x02010003, 0x06050407, 0x0A09080B, 0x0E0D0C0F),
132  SIMD_4x32(0x0E0D0C0F, 0x02010003, 0x06050407, 0x0A09080B),
133  SIMD_4x32(0x0A09080B, 0x0E0D0C0F, 0x02010003, 0x06050407),
134  SIMD_4x32(0x06050407, 0x0A09080B, 0x0E0D0C0F, 0x02010003),
135  };
136 
137  const SIMD_4x32 Bh = high_nibs(B);
138  SIMD_4x32 Bl = low_nibs(B);
139  const SIMD_4x32 t2 = shuffle(k_inv2, Bl);
140  Bl ^= Bh;
141 
142  const SIMD_4x32 t5 = Bl ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
143  const SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bl));
144 
145  const SIMD_4x32 t7 = shuffle(sb1t, t6) ^ shuffle(sb1u, t5) ^ K;
146  const SIMD_4x32 t8 = shuffle(sb2t, t6) ^ shuffle(sb2u, t5) ^ shuffle(t7, mc_forward[r % 4]);
147 
148  return shuffle(t8, mc_forward[r % 4]) ^ shuffle(t7, mc_backward[r % 4]) ^ t8;
149  }
150 
151 inline SIMD_4x32 aes_enc_last_round(SIMD_4x32 B, SIMD_4x32 K, size_t r)
152  {
153  const SIMD_4x32 sbou = SIMD_4x32(0x6FBDC700, 0xD0D26D17, 0xC502A878, 0x15AABF7A);
154  const SIMD_4x32 sbot = SIMD_4x32(0x5FBB6A00, 0xCFE474A5, 0x412B35FA, 0x8E1E90D1);
155 
156  const SIMD_4x32 Bh = high_nibs(B);
157  SIMD_4x32 Bl = low_nibs(B);
158  const SIMD_4x32 t2 = shuffle(k_inv2, Bl);
159  Bl ^= Bh;
160 
161  const SIMD_4x32 t5 = Bl ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
162  const SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bl));
163 
164  return shuffle(shuffle(sbou, t5) ^ shuffle(sbot, t6) ^ K, vperm_sr[r % 4]);
165  }
166 
167 inline SIMD_4x32 aes_dec_first_round(SIMD_4x32 B, SIMD_4x32 K)
168  {
169  const SIMD_4x32 k_dipt1 = SIMD_4x32(0x0B545F00, 0x0F505B04, 0x114E451A, 0x154A411E);
170  const SIMD_4x32 k_dipt2 = SIMD_4x32(0x60056500, 0x86E383E6, 0xF491F194, 0x12771772);
171 
172  return shuffle(k_dipt1, low_nibs(B)) ^ shuffle(k_dipt2, high_nibs(B)) ^ K;
173  }
174 
175 inline SIMD_4x32 aes_dec_round(SIMD_4x32 B, SIMD_4x32 K, size_t r)
176  {
177  const SIMD_4x32 sb9u = SIMD_4x32(0x9A86D600, 0x851C0353, 0x4F994CC9, 0xCAD51F50);
178  const SIMD_4x32 sb9t = SIMD_4x32(0xECD74900, 0xC03B1789, 0xB2FBA565, 0x725E2C9E);
179 
180  const SIMD_4x32 sbeu = SIMD_4x32(0x26D4D000, 0x46F29296, 0x64B4F6B0, 0x22426004);
181  const SIMD_4x32 sbet = SIMD_4x32(0xFFAAC100, 0x0C55A6CD, 0x98593E32, 0x9467F36B);
182 
183  const SIMD_4x32 sbdu = SIMD_4x32(0xE6B1A200, 0x7D57CCDF, 0x882A4439, 0xF56E9B13);
184  const SIMD_4x32 sbdt = SIMD_4x32(0x24C6CB00, 0x3CE2FAF7, 0x15DEEFD3, 0x2931180D);
185 
186  const SIMD_4x32 sbbu = SIMD_4x32(0x96B44200, 0xD0226492, 0xB0F2D404, 0x602646F6);
187  const SIMD_4x32 sbbt = SIMD_4x32(0xCD596700, 0xC19498A6, 0x3255AA6B, 0xF3FF0C3E);
188 
189  const SIMD_4x32 mcx[4] = {
190  SIMD_4x32(0x0C0F0E0D, 0x00030201, 0x04070605, 0x080B0A09),
191  SIMD_4x32(0x080B0A09, 0x0C0F0E0D, 0x00030201, 0x04070605),
192  SIMD_4x32(0x04070605, 0x080B0A09, 0x0C0F0E0D, 0x00030201),
193  SIMD_4x32(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D),
194  };
195 
196  const SIMD_4x32 Bh = high_nibs(B);
197  B = low_nibs(B);
198  const SIMD_4x32 t2 = shuffle(k_inv2, B);
199 
200  B ^= Bh;
201 
202  const SIMD_4x32 t5 = B ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
203  const SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, B));
204 
205  const SIMD_4x32 mc = mcx[(r-1)%4];
206 
207  const SIMD_4x32 t8 = shuffle(sb9t, t6) ^ shuffle(sb9u, t5) ^ K;
208  const SIMD_4x32 t9 = shuffle(t8, mc) ^ shuffle(sbdu, t5) ^ shuffle(sbdt, t6);
209  const SIMD_4x32 t12 = shuffle(t9, mc) ^ shuffle(sbbu, t5) ^ shuffle(sbbt, t6);
210  return shuffle(t12, mc) ^ shuffle(sbeu, t5) ^ shuffle(sbet, t6);
211  }
212 
213 inline SIMD_4x32 aes_dec_last_round(SIMD_4x32 B, SIMD_4x32 K, size_t r)
214  {
215  const SIMD_4x32 sbou = SIMD_4x32(0x7EF94000, 0x1387EA53, 0xD4943E2D, 0xC7AA6DB9);
216  const SIMD_4x32 sbot = SIMD_4x32(0x93441D00, 0x12D7560F, 0xD8C58E9C, 0xCA4B8159);
217 
218  const uint32_t which_sr = ((((r - 1) << 4) ^ 48) & 48) / 16;
219 
220  const SIMD_4x32 Bh = high_nibs(B);
221  B = low_nibs(B);
222  const SIMD_4x32 t2 = shuffle(k_inv2, B);
223 
224  B ^= Bh;
225 
226  const SIMD_4x32 t5 = B ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
227  const SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, B));
228 
229  const SIMD_4x32 x = shuffle(sbou, t5) ^ shuffle(sbot, t6) ^ K;
230  return shuffle(x, vperm_sr[which_sr]);
231  }
232 
233 void vperm_encrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks,
234  const SIMD_4x32 K[], size_t rounds)
235  {
236  CT::poison(in, blocks * 16);
237 
238  const size_t blocks2 = blocks - (blocks % 2);
239 
240  for(size_t i = 0; i != blocks2; i += 2)
241  {
242  SIMD_4x32 B0 = SIMD_4x32::load_le(in + i*16);
243  SIMD_4x32 B1 = SIMD_4x32::load_le(in + (i+1)*16);
244 
245  B0 = aes_enc_first_round(B0, K[0]);
246  B1 = aes_enc_first_round(B1, K[0]);
247 
248  for(size_t r = 1; r != rounds; ++r)
249  {
250  B0 = aes_enc_round(B0, K[r], r);
251  B1 = aes_enc_round(B1, K[r], r);
252  }
253 
254  B0 = aes_enc_last_round(B0, K[rounds], rounds);
255  B1 = aes_enc_last_round(B1, K[rounds], rounds);
256 
257  B0.store_le(out + i*16);
258  B1.store_le(out + (i+1)*16);
259  }
260 
261  for(size_t i = blocks2; i < blocks; ++i)
262  {
263  SIMD_4x32 B = SIMD_4x32::load_le(in + i*16); // ???
264 
265  B = aes_enc_first_round(B, K[0]);
266 
267  for(size_t r = 1; r != rounds; ++r)
268  {
269  B = aes_enc_round(B, K[r], r);
270  }
271 
272  B = aes_enc_last_round(B, K[rounds], rounds);
273  B.store_le(out + i*16);
274  }
275 
276  CT::unpoison(in, blocks * 16);
277  CT::unpoison(out, blocks * 16);
278  }
279 
280 void vperm_decrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks,
281  const SIMD_4x32 K[], size_t rounds)
282  {
283  CT::poison(in, blocks * 16);
284 
285  const size_t blocks2 = blocks - (blocks % 2);
286 
287  for(size_t i = 0; i != blocks2; i += 2)
288  {
289  SIMD_4x32 B0 = SIMD_4x32::load_le(in + i*16);
290  SIMD_4x32 B1 = SIMD_4x32::load_le(in + (i+1)*16);
291 
292  B0 = aes_dec_first_round(B0, K[0]);
293  B1 = aes_dec_first_round(B1, K[0]);
294 
295  for(size_t r = 1; r != rounds; ++r)
296  {
297  B0 = aes_dec_round(B0, K[r], r);
298  B1 = aes_dec_round(B1, K[r], r);
299  }
300 
301  B0 = aes_dec_last_round(B0, K[rounds], rounds);
302  B1 = aes_dec_last_round(B1, K[rounds], rounds);
303 
304  B0.store_le(out + i*16);
305  B1.store_le(out + (i+1)*16);
306  }
307 
308  for(size_t i = blocks2; i < blocks; ++i)
309  {
310  SIMD_4x32 B = SIMD_4x32::load_le(in + i*16); // ???
311 
312  B = aes_dec_first_round(B, K[0]);
313 
314  for(size_t r = 1; r != rounds; ++r)
315  {
316  B = aes_dec_round(B, K[r], r);
317  }
318 
319  B = aes_dec_last_round(B, K[rounds], rounds);
320  B.store_le(out + i*16);
321  }
322 
323  CT::unpoison(in, blocks * 16);
324  CT::unpoison(out, blocks * 16);
325  }
326 
327 }
328 
329 void AES_128::vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
330  {
331  const SIMD_4x32 K[11] = {
332  SIMD_4x32(&m_EK[4* 0]), SIMD_4x32(&m_EK[4* 1]), SIMD_4x32(&m_EK[4* 2]),
333  SIMD_4x32(&m_EK[4* 3]), SIMD_4x32(&m_EK[4* 4]), SIMD_4x32(&m_EK[4* 5]),
334  SIMD_4x32(&m_EK[4* 6]), SIMD_4x32(&m_EK[4* 7]), SIMD_4x32(&m_EK[4* 8]),
335  SIMD_4x32(&m_EK[4* 9]), SIMD_4x32(&m_EK[4*10]),
336  };
337 
338  return vperm_encrypt_blocks(in, out, blocks, K, 10);
339  }
340 
341 void AES_128::vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
342  {
343  const SIMD_4x32 K[11] = {
344  SIMD_4x32(&m_DK[4* 0]), SIMD_4x32(&m_DK[4* 1]), SIMD_4x32(&m_DK[4* 2]),
345  SIMD_4x32(&m_DK[4* 3]), SIMD_4x32(&m_DK[4* 4]), SIMD_4x32(&m_DK[4* 5]),
346  SIMD_4x32(&m_DK[4* 6]), SIMD_4x32(&m_DK[4* 7]), SIMD_4x32(&m_DK[4* 8]),
347  SIMD_4x32(&m_DK[4* 9]), SIMD_4x32(&m_DK[4*10]),
348  };
349 
350  return vperm_decrypt_blocks(in, out, blocks, K, 10);
351  }
352 
353 void AES_192::vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
354  {
355  const SIMD_4x32 K[13] = {
356  SIMD_4x32(&m_EK[4* 0]), SIMD_4x32(&m_EK[4* 1]), SIMD_4x32(&m_EK[4* 2]),
357  SIMD_4x32(&m_EK[4* 3]), SIMD_4x32(&m_EK[4* 4]), SIMD_4x32(&m_EK[4* 5]),
358  SIMD_4x32(&m_EK[4* 6]), SIMD_4x32(&m_EK[4* 7]), SIMD_4x32(&m_EK[4* 8]),
359  SIMD_4x32(&m_EK[4* 9]), SIMD_4x32(&m_EK[4*10]), SIMD_4x32(&m_EK[4*11]),
360  SIMD_4x32(&m_EK[4*12]),
361  };
362 
363  return vperm_encrypt_blocks(in, out, blocks, K, 12);
364  }
365 
366 void AES_192::vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
367  {
368  const SIMD_4x32 K[13] = {
369  SIMD_4x32(&m_DK[4* 0]), SIMD_4x32(&m_DK[4* 1]), SIMD_4x32(&m_DK[4* 2]),
370  SIMD_4x32(&m_DK[4* 3]), SIMD_4x32(&m_DK[4* 4]), SIMD_4x32(&m_DK[4* 5]),
371  SIMD_4x32(&m_DK[4* 6]), SIMD_4x32(&m_DK[4* 7]), SIMD_4x32(&m_DK[4* 8]),
372  SIMD_4x32(&m_DK[4* 9]), SIMD_4x32(&m_DK[4*10]), SIMD_4x32(&m_DK[4*11]),
373  SIMD_4x32(&m_DK[4*12]),
374  };
375 
376  return vperm_decrypt_blocks(in, out, blocks, K, 12);
377  }
378 
379 void AES_256::vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
380  {
381  const SIMD_4x32 K[15] = {
382  SIMD_4x32(&m_EK[4* 0]), SIMD_4x32(&m_EK[4* 1]), SIMD_4x32(&m_EK[4* 2]),
383  SIMD_4x32(&m_EK[4* 3]), SIMD_4x32(&m_EK[4* 4]), SIMD_4x32(&m_EK[4* 5]),
384  SIMD_4x32(&m_EK[4* 6]), SIMD_4x32(&m_EK[4* 7]), SIMD_4x32(&m_EK[4* 8]),
385  SIMD_4x32(&m_EK[4* 9]), SIMD_4x32(&m_EK[4*10]), SIMD_4x32(&m_EK[4*11]),
386  SIMD_4x32(&m_EK[4*12]), SIMD_4x32(&m_EK[4*13]), SIMD_4x32(&m_EK[4*14]),
387  };
388 
389  return vperm_encrypt_blocks(in, out, blocks, K, 14);
390  }
391 
392 void AES_256::vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
393  {
394  const SIMD_4x32 K[15] = {
395  SIMD_4x32(&m_DK[4* 0]), SIMD_4x32(&m_DK[4* 1]), SIMD_4x32(&m_DK[4* 2]),
396  SIMD_4x32(&m_DK[4* 3]), SIMD_4x32(&m_DK[4* 4]), SIMD_4x32(&m_DK[4* 5]),
397  SIMD_4x32(&m_DK[4* 6]), SIMD_4x32(&m_DK[4* 7]), SIMD_4x32(&m_DK[4* 8]),
398  SIMD_4x32(&m_DK[4* 9]), SIMD_4x32(&m_DK[4*10]), SIMD_4x32(&m_DK[4*11]),
399  SIMD_4x32(&m_DK[4*12]), SIMD_4x32(&m_DK[4*13]), SIMD_4x32(&m_DK[4*14]),
400  };
401 
402  return vperm_decrypt_blocks(in, out, blocks, K, 14);
403  }
404 
405 namespace {
406 
407 SIMD_4x32 aes_schedule_transform(SIMD_4x32 input,
408  SIMD_4x32 table_1,
409  SIMD_4x32 table_2)
410  {
411  return shuffle(table_1, low_nibs(input)) ^ shuffle(table_2, high_nibs(input));
412  }
413 
414 SIMD_4x32 aes_schedule_mangle(SIMD_4x32 k, uint8_t round_no)
415  {
416  const SIMD_4x32 mc_forward0(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D);
417 
418  SIMD_4x32 t = shuffle(k ^ SIMD_4x32::splat_u8(0x5B), mc_forward0);
419  SIMD_4x32 t2 = t;
420  t = shuffle(t, mc_forward0);
421  t2 = t ^ t2 ^ shuffle(t, mc_forward0);
422  return shuffle(t2, vperm_sr[round_no % 4]);
423  }
424 
425 SIMD_4x32 aes_schedule_mangle_dec(SIMD_4x32 k, uint8_t round_no)
426  {
427  const SIMD_4x32 mc_forward0(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D);
428 
429  const SIMD_4x32 dsk[8] = {
430  SIMD_4x32(0x7ED9A700, 0xB6116FC8, 0x82255BFC, 0x4AED9334),
431  SIMD_4x32(0x27143300, 0x45765162, 0xE9DAFDCE, 0x8BB89FAC),
432  SIMD_4x32(0xCCA86400, 0x27438FEB, 0xADC90561, 0x4622EE8A),
433  SIMD_4x32(0x4F92DD00, 0x815C13CE, 0xBD602FF2, 0x73AEE13C),
434  SIMD_4x32(0x01C6C700, 0x03C4C502, 0xFA3D3CFB, 0xF83F3EF9),
435  SIMD_4x32(0x38CFF700, 0xEE1921D6, 0x7384BC4B, 0xA5526A9D),
436  SIMD_4x32(0x53732000, 0xE3C390B0, 0x10306343, 0xA080D3F3),
437  SIMD_4x32(0x036982E8, 0xA0CA214B, 0x8CE60D67, 0x2F45AEC4),
438  };
439 
440  SIMD_4x32 t = aes_schedule_transform(k, dsk[0], dsk[1]);
441  SIMD_4x32 output = shuffle(t, mc_forward0);
442 
443  t = aes_schedule_transform(t, dsk[2], dsk[3]);
444  output = shuffle(t ^ output, mc_forward0);
445 
446  t = aes_schedule_transform(t, dsk[4], dsk[5]);
447  output = shuffle(t ^ output, mc_forward0);
448 
449  t = aes_schedule_transform(t, dsk[6], dsk[7]);
450  output = shuffle(t ^ output, mc_forward0);
451 
452  return shuffle(output, vperm_sr[round_no % 4]);
453  }
454 
455 SIMD_4x32 aes_schedule_mangle_last(SIMD_4x32 k, uint8_t round_no)
456  {
457  const SIMD_4x32 out_tr1(0xD6B66000, 0xFF9F4929, 0xDEBE6808, 0xF7974121);
458  const SIMD_4x32 out_tr2(0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1);
459 
460  k = shuffle(k, vperm_sr[round_no % 4]);
461  k ^= SIMD_4x32::splat_u8(0x5B);
462  return aes_schedule_transform(k, out_tr1, out_tr2);
463  }
464 
465 SIMD_4x32 aes_schedule_mangle_last_dec(SIMD_4x32 k)
466  {
467  const SIMD_4x32 deskew1(0x47A4E300, 0x07E4A340, 0x5DBEF91A, 0x1DFEB95A);
468  const SIMD_4x32 deskew2(0x83EA6900, 0x5F36B5DC, 0xF49D1E77, 0x2841C2AB);
469 
470  k ^= SIMD_4x32::splat_u8(0x5B);
471  return aes_schedule_transform(k, deskew1, deskew2);
472  }
473 
474 SIMD_4x32 aes_schedule_round(SIMD_4x32 input1, SIMD_4x32 input2)
475  {
476  SIMD_4x32 smeared = input2 ^ input2.shift_elems_left<1>();
477  smeared ^= smeared.shift_elems_left<2>();
478  smeared ^= SIMD_4x32::splat_u8(0x5B);
479 
480  const SIMD_4x32 Bh = high_nibs(input1);
481  SIMD_4x32 Bl = low_nibs(input1);
482 
483  const SIMD_4x32 t2 = shuffle(k_inv2, Bl);
484 
485  Bl ^= Bh;
486 
487  SIMD_4x32 t5 = Bl ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
488  SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bl));
489 
490  return smeared ^ shuffle(sb1u, t5) ^ shuffle(sb1t, t6);
491  }
492 
493 SIMD_4x32 aes_schedule_round(SIMD_4x32 rc, SIMD_4x32 input1, SIMD_4x32 input2)
494  {
495  // This byte shuffle is equivalent to alignr<1>(shuffle32(input1, (3,3,3,3)));
496  const SIMD_4x32 shuffle3333_15 = SIMD_4x32::splat(0x0C0F0E0D);
497  return aes_schedule_round(shuffle(input1, shuffle3333_15), input2 ^ rc);
498  }
499 
500 SIMD_4x32 aes_schedule_192_smear(SIMD_4x32 x, SIMD_4x32 y)
501  {
502  const SIMD_4x32 shuffle3332 =
503  SIMD_4x32(0x0B0A0908, 0x0F0E0D0C, 0x0F0E0D0C, 0x0F0E0D0C);
504  const SIMD_4x32 shuffle2000 =
505  SIMD_4x32(0x03020100, 0x03020100, 0x03020100, 0x0B0A0908);
506 
507  const SIMD_4x32 zero_top_half(0, 0, 0xFFFFFFFF, 0xFFFFFFFF);
508  y &= zero_top_half;
509  return y ^ shuffle(x, shuffle3332) ^ shuffle(y, shuffle2000);
510  }
511 
512 }
513 
514 void AES_128::vperm_key_schedule(const uint8_t keyb[], size_t)
515  {
516  m_EK.resize(11*4);
517  m_DK.resize(11*4);
518 
519  SIMD_4x32 key = SIMD_4x32::load_le(keyb);
520 
521  shuffle(key, vperm_sr[2]).store_le(&m_DK[4*10]);
522 
523  key = aes_schedule_transform(key, k_ipt1, k_ipt2);
524  key.store_le(&m_EK[0]);
525 
526  for(size_t i = 1; i != 10; ++i)
527  {
528  key = aes_schedule_round(rcon[i-1], key, key);
529 
530  aes_schedule_mangle(key, (12-i) % 4).store_le(&m_EK[4*i]);
531 
532  aes_schedule_mangle_dec(key, (10-i)%4).store_le(&m_DK[4*(10-i)]);
533  }
534 
535  key = aes_schedule_round(rcon[9], key, key);
536  aes_schedule_mangle_last(key, 2).store_le(&m_EK[4*10]);
537  aes_schedule_mangle_last_dec(key).store_le(&m_DK[0]);
538  }
539 
// AES-192 key schedule: fills m_EK and m_DK with 13 round keys of four words
// each. Every loop iteration produces three encryption keys (slots 3i+1, 3i+2,
// 3i+3) and the mirrored decryption keys (slots 11-3i, 10-3i, 9-3i).
// The second (unnamed) parameter is not used.
540 void AES_192::vperm_key_schedule(const uint8_t keyb[], size_t)
541  {
542  m_EK.resize(13*4);
543  m_DK.resize(13*4);
544 
// key2 is loaded starting 8 bytes into the key, so it overlaps the upper half of key1
545  SIMD_4x32 key1 = SIMD_4x32::load_le(keyb);
546  SIMD_4x32 key2 = SIMD_4x32::load_le(keyb + 8);
547 
// Last decryption key slot is derived from the untransformed key1
548  shuffle(key1, vperm_sr[0]).store_le(&m_DK[12*4]);
549 
550  key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2);
551  key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2);
552 
553  key1.store_le(&m_EK[0]);
554 
555  for(size_t i = 0; i != 4; ++i)
556  {
557  // key2 with 8 high bytes masked off
558  SIMD_4x32 t = key2;
// First key of the group: new key2 combined with the saved previous key2 via alignr8
559  key2 = aes_schedule_round(rcon[2*i], key2, key1);
560  const SIMD_4x32 key2t = alignr8(key2, t);
561  aes_schedule_mangle(key2t, (i+3)%4).store_le(&m_EK[4*(3*i+1)]);
562  aes_schedule_mangle_dec(key2t, (i+3)%4).store_le(&m_DK[4*(11-3*i)]);
563 
// Second key of the group: smear of the new key2 with the saved value
564  t = aes_schedule_192_smear(key2, t);
565 
566  aes_schedule_mangle(t, (i+2)%4).store_le(&m_EK[4*(3*i+2)]);
567  aes_schedule_mangle_dec(t, (i+2)%4).store_le(&m_DK[4*(10-3*i)]);
568 
// Third key of the group
569  key2 = aes_schedule_round(rcon[2*i+1], t, key2);
570 
// On the final iteration these are EK slot 12 and DK slot 0, which use the "last" mangle variants
571  if(i == 3)
572  {
573  aes_schedule_mangle_last(key2, (i+1)%4).store_le(&m_EK[4*(3*i+3)]);
574  aes_schedule_mangle_last_dec(key2).store_le(&m_DK[4*(9-3*i)]);
575  }
576  else
577  {
578  aes_schedule_mangle(key2, (i+1)%4).store_le(&m_EK[4*(3*i+3)]);
579  aes_schedule_mangle_dec(key2, (i+1)%4).store_le(&m_DK[4*(9-3*i)]);
580  }
581 
// Carry state into the next iteration
582  key1 = key2;
583  key2 = aes_schedule_192_smear(key2, t);
584  }
585  }
586 
// AES-256 key schedule: fills m_EK and m_DK with 15 round keys of four words
// each. Encryption keys are written in ascending order, the matching
// decryption keys in descending order. The second (unnamed) parameter is not used.
587 void AES_256::vperm_key_schedule(const uint8_t keyb[], size_t)
588  {
589  m_EK.resize(15*4);
590  m_DK.resize(15*4);
591 
// The two 16-byte halves of the 32-byte key
592  SIMD_4x32 key1 = SIMD_4x32::load_le(keyb);
593  SIMD_4x32 key2 = SIMD_4x32::load_le(keyb + 16);
594 
// Last decryption key slot is derived from the untransformed first half
595  shuffle(key1, vperm_sr[2]).store_le(&m_DK[4*14]);
596 
597  key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2);
598  key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2);
599 
// Round keys 0 and 1 come straight from the two transformed halves
600  key1.store_le(&m_EK[0]);
601  aes_schedule_mangle(key2, 3).store_le(&m_EK[4]);
602 
603  aes_schedule_mangle_dec(key2, 1).store_le(&m_DK[4*13]);
604 
// Byte pattern 0x0F0E0D0C in every word; used below on the odd-numbered keys
605  const SIMD_4x32 shuffle3333 = SIMD_4x32::splat(0x0F0E0D0C);
606 
// Each iteration produces two round keys: i (even, uses a round constant) and i+1 (odd, no round constant)
607  for(size_t i = 2; i != 14; i += 2)
608  {
// Save the previous odd key; it is the second input of the odd-key step below
609  const SIMD_4x32 k_t = key2;
610  key1 = key2 = aes_schedule_round(rcon[(i/2)-1], key2, key1);
611 
612  aes_schedule_mangle(key2, i % 4).store_le(&m_EK[4*i]);
613  aes_schedule_mangle_dec(key2, (i+2)%4).store_le(&m_DK[4*(14-i)]);
614 
// Odd key: two-argument schedule round on the shuffled even key and the saved k_t
615  key2 = aes_schedule_round(shuffle(key2, shuffle3333), k_t);
616 
617  aes_schedule_mangle(key2, (i-1)%4).store_le(&m_EK[4*(i+1)]);
618  aes_schedule_mangle_dec(key2, (i+1)%4).store_le(&m_DK[4*(13-i)]);
619  }
620 
// Final key (EK slot 14 / DK slot 0) uses the "last" mangle variants
621  key2 = aes_schedule_round(rcon[6], key2, key1);
622 
623  aes_schedule_mangle_last(key2, 2).store_le(&m_EK[4*14]);
624  aes_schedule_mangle_last_dec(key2).store_le(&m_DK[0]);
625  }
626 
627 }
static SIMD_4x32 load_le(const void *in)
Definition: simd_32.h:159
void poison(const T *p, size_t n)
Definition: ct_utils.h:48
#define BOTAN_FUNC_ISA(isa)
Definition: compiler.h:71
size_t high_bit(T n)
Definition: bit_ops.h:55
static SIMD_4x32 splat_u8(uint8_t B)
Definition: simd_32.h:144
Definition: alg_id.cpp:13
static SIMD_4x32 splat(uint32_t B)
Definition: simd_32.h:130
void unpoison(const T *p, size_t n)
Definition: ct_utils.h:59