11 #include <botan/aes.h>
12 #include <botan/loadstor.h>
// Helper macros that apply one AES step to four state vectors at once.
// Each one reads and overwrites variables named B0, B1, B2 and B3, which
// must already be in scope where the macro is expanded.
//   AES_ENC_4_ROUNDS(K):          Bn = vaesmcq_u8(vaeseq_u8(Bn, K))
//   AES_ENC_4_LAST_ROUNDS(K, K2): Bn = veorq_u8(vaeseq_u8(Bn, K), K2)
//   AES_DEC_4_ROUNDS(K):          Bn = vaesimcq_u8(vaesdq_u8(Bn, K))
//   AES_DEC_4_LAST_ROUNDS(K, K2): Bn = veorq_u8(vaesdq_u8(Bn, K), K2)
// NOTE(review): every body line below ends in a line continuation, even the
// last line of each macro, and the embedded line numbers skip values. The
// opening and closing lines of each macro body appear to be missing from
// this view - confirm against the full file.
17 #define AES_ENC_4_ROUNDS(K) \
20 B0 = vaesmcq_u8(vaeseq_u8(B0, K)); \
21 B1 = vaesmcq_u8(vaeseq_u8(B1, K)); \
22 B2 = vaesmcq_u8(vaeseq_u8(B2, K)); \
23 B3 = vaesmcq_u8(vaeseq_u8(B3, K)); \
26 #define AES_ENC_4_LAST_ROUNDS(K, K2) \
29 B0 = veorq_u8(vaeseq_u8(B0, K), K2); \
30 B1 = veorq_u8(vaeseq_u8(B1, K), K2); \
31 B2 = veorq_u8(vaeseq_u8(B2, K), K2); \
32 B3 = veorq_u8(vaeseq_u8(B3, K), K2); \
35 #define AES_DEC_4_ROUNDS(K) \
38 B0 = vaesimcq_u8(vaesdq_u8(B0, K)); \
39 B1 = vaesimcq_u8(vaesdq_u8(B1, K)); \
40 B2 = vaesimcq_u8(vaesdq_u8(B2, K)); \
41 B3 = vaesimcq_u8(vaesdq_u8(B3, K)); \
44 #define AES_DEC_4_LAST_ROUNDS(K, K2) \
47 B0 = veorq_u8(vaesdq_u8(B0, K), K2); \
48 B1 = veorq_u8(vaesdq_u8(B1, K), K2); \
49 B2 = veorq_u8(vaesdq_u8(B2, K), K2); \
50 B3 = veorq_u8(vaesdq_u8(B3, K), K2); \
// AES-128 encryption of `blocks` 16-byte blocks using ARMv8 AES intrinsics.
//   in     - input bytes; the visible loop reads 16 bytes at in + 16*i
//   out    - output bytes; the visible loop writes 16 bytes at out + 16*i
//   blocks - number of 16-byte blocks handled by the visible loop
// NOTE(review): the embedded line numbers skip values and no braces are
// visible, so parts of this body are missing from this view. Comments below
// describe only the statements that are shown.
57 void AES_128::armv8_encrypt_n(const uint8_t in[], uint8_t out[],
size_t blocks)
const
// Byte-wise views of the key storage held in members m_EK and m_ME.
59 const uint8_t *skey =
reinterpret_cast<const uint8_t*
>(m_EK.data());
60 const uint8_t *mkey =
reinterpret_cast<const uint8_t*
>(m_ME.data());
// Eleven 16-byte key vectors: K0..K9 come from consecutive 16-byte slots of
// m_EK, while the last one (K10) is loaded from m_ME.
62 const uint8x16_t K0 = vld1q_u8(skey + 0);
63 const uint8x16_t K1 = vld1q_u8(skey + 16);
64 const uint8x16_t K2 = vld1q_u8(skey + 32);
65 const uint8x16_t K3 = vld1q_u8(skey + 48);
66 const uint8x16_t K4 = vld1q_u8(skey + 64);
67 const uint8x16_t K5 = vld1q_u8(skey + 80);
68 const uint8x16_t K6 = vld1q_u8(skey + 96);
69 const uint8x16_t K7 = vld1q_u8(skey + 112);
70 const uint8x16_t K8 = vld1q_u8(skey + 128);
71 const uint8x16_t K9 = vld1q_u8(skey + 144);
72 const uint8x16_t K10 = vld1q_u8(mkey);
// Loads four consecutive blocks (64 bytes) into B0..B3, the names the
// AES_ENC_4_* macros operate on.
// NOTE(review): the statement enclosing these loads, the macro invocations
// and the matching stores are not visible here - presumably a path that
// handles four blocks at a time; confirm against the full file.
76 uint8x16_t B0 = vld1q_u8(in);
77 uint8x16_t B1 = vld1q_u8(in+16);
78 uint8x16_t B2 = vld1q_u8(in+32);
79 uint8x16_t B3 = vld1q_u8(in+48);
// One block per iteration: nine vaeseq_u8 + vaesmcq_u8 steps with K0..K8,
// then a final vaeseq_u8 with K9 (no vaesmcq_u8) XORed with K10.
102 for(
size_t i = 0; i != blocks; ++i)
104 uint8x16_t B = vld1q_u8(in+16*i);
105 B = vaesmcq_u8(vaeseq_u8(B, K0));
106 B = vaesmcq_u8(vaeseq_u8(B, K1));
107 B = vaesmcq_u8(vaeseq_u8(B, K2));
108 B = vaesmcq_u8(vaeseq_u8(B, K3));
109 B = vaesmcq_u8(vaeseq_u8(B, K4));
110 B = vaesmcq_u8(vaeseq_u8(B, K5));
111 B = vaesmcq_u8(vaeseq_u8(B, K6));
112 B = vaesmcq_u8(vaeseq_u8(B, K7));
113 B = vaesmcq_u8(vaeseq_u8(B, K8));
114 B = veorq_u8(vaeseq_u8(B, K9), K10);
115 vst1q_u8(out+16*i, B);
// AES-128 decryption of `blocks` 16-byte blocks using ARMv8 AES intrinsics.
//   in     - input bytes; the visible loop reads 16 bytes at in + 16*i
//   out    - output bytes; the visible loop writes 16 bytes at out + 16*i
//   blocks - number of 16-byte blocks handled by the visible loop
// NOTE(review): the embedded line numbers skip values and no braces are
// visible, so parts of this body are missing from this view. Comments below
// describe only the statements that are shown.
123 void AES_128::armv8_decrypt_n(const uint8_t in[], uint8_t out[],
size_t blocks)
const
// Byte-wise views of the key storage held in members m_DK and m_MD.
125 const uint8_t *skey =
reinterpret_cast<const uint8_t*
>(m_DK.data());
126 const uint8_t *mkey =
reinterpret_cast<const uint8_t*
>(m_MD.data());
// Eleven 16-byte key vectors: K0..K9 come from consecutive 16-byte slots of
// m_DK, while the last one (K10) is loaded from m_MD.
128 const uint8x16_t K0 = vld1q_u8(skey + 0);
129 const uint8x16_t K1 = vld1q_u8(skey + 16);
130 const uint8x16_t K2 = vld1q_u8(skey + 32);
131 const uint8x16_t K3 = vld1q_u8(skey + 48);
132 const uint8x16_t K4 = vld1q_u8(skey + 64);
133 const uint8x16_t K5 = vld1q_u8(skey + 80);
134 const uint8x16_t K6 = vld1q_u8(skey + 96);
135 const uint8x16_t K7 = vld1q_u8(skey + 112);
136 const uint8x16_t K8 = vld1q_u8(skey + 128);
137 const uint8x16_t K9 = vld1q_u8(skey + 144);
138 const uint8x16_t K10 = vld1q_u8(mkey);
// Loads four consecutive blocks (64 bytes) into B0..B3, the names the
// AES_DEC_4_* macros operate on.
// NOTE(review): the statement enclosing these loads and the macro
// invocations are not visible here - presumably a path that handles four
// blocks at a time; confirm against the full file.
142 uint8x16_t B0 = vld1q_u8(in);
143 uint8x16_t B1 = vld1q_u8(in+16);
144 uint8x16_t B2 = vld1q_u8(in+32);
145 uint8x16_t B3 = vld1q_u8(in+48);
// Writes B1..B3 back to the second, third and fourth 16-byte output slots.
// NOTE(review): the store of B0 to `out` is not visible in this view.
159 vst1q_u8(out+16, B1);
160 vst1q_u8(out+32, B2);
161 vst1q_u8(out+48, B3);
// One block per iteration: nine vaesdq_u8 + vaesimcq_u8 steps with K0..K8,
// then a final vaesdq_u8 with K9 (no vaesimcq_u8) XORed with K10.
168 for(
size_t i = 0; i != blocks; ++i)
170 uint8x16_t B = vld1q_u8(in+16*i);
171 B = vaesimcq_u8(vaesdq_u8(B, K0));
172 B = vaesimcq_u8(vaesdq_u8(B, K1));
173 B = vaesimcq_u8(vaesdq_u8(B, K2));
174 B = vaesimcq_u8(vaesdq_u8(B, K3));
175 B = vaesimcq_u8(vaesdq_u8(B, K4));
176 B = vaesimcq_u8(vaesdq_u8(B, K5));
177 B = vaesimcq_u8(vaesdq_u8(B, K6));
178 B = vaesimcq_u8(vaesdq_u8(B, K7));
179 B = vaesimcq_u8(vaesdq_u8(B, K8));
180 B = veorq_u8(vaesdq_u8(B, K9), K10);
181 vst1q_u8(out+16*i, B);
// AES-192 encryption of `blocks` 16-byte blocks using ARMv8 AES intrinsics.
//   in     - input bytes; the visible loop reads 16 bytes at in + 16*i
//   out    - output bytes; the visible loop writes 16 bytes at out + 16*i
//   blocks - number of 16-byte blocks handled by the visible loop
// NOTE(review): the embedded line numbers skip values and no braces are
// visible, so parts of this body are missing from this view. Comments below
// describe only the statements that are shown.
189 void AES_192::armv8_encrypt_n(const uint8_t in[], uint8_t out[],
size_t blocks)
const
// Byte-wise views of the key storage held in members m_EK and m_ME.
191 const uint8_t *skey =
reinterpret_cast<const uint8_t*
>(m_EK.data());
192 const uint8_t *mkey =
reinterpret_cast<const uint8_t*
>(m_ME.data());
// Thirteen 16-byte key vectors: K0..K11 come from consecutive 16-byte slots
// of m_EK, while the last one (K12) is loaded from m_ME.
194 const uint8x16_t K0 = vld1q_u8(skey + 0);
195 const uint8x16_t K1 = vld1q_u8(skey + 16);
196 const uint8x16_t K2 = vld1q_u8(skey + 32);
197 const uint8x16_t K3 = vld1q_u8(skey + 48);
198 const uint8x16_t K4 = vld1q_u8(skey + 64);
199 const uint8x16_t K5 = vld1q_u8(skey + 80);
200 const uint8x16_t K6 = vld1q_u8(skey + 96);
201 const uint8x16_t K7 = vld1q_u8(skey + 112);
202 const uint8x16_t K8 = vld1q_u8(skey + 128);
203 const uint8x16_t K9 = vld1q_u8(skey + 144);
204 const uint8x16_t K10 = vld1q_u8(skey + 160);
205 const uint8x16_t K11 = vld1q_u8(skey + 176);
206 const uint8x16_t K12 = vld1q_u8(mkey);
// Loads four consecutive blocks (64 bytes) into B0..B3, the names the
// AES_ENC_4_* macros operate on.
// NOTE(review): the statement enclosing these loads and the macro
// invocations are not visible here - presumably a path that handles four
// blocks at a time; confirm against the full file.
210 uint8x16_t B0 = vld1q_u8(in);
211 uint8x16_t B1 = vld1q_u8(in+16);
212 uint8x16_t B2 = vld1q_u8(in+32);
213 uint8x16_t B3 = vld1q_u8(in+48);
// Writes B1..B3 back to the second, third and fourth 16-byte output slots.
// NOTE(review): the store of B0 to `out` is not visible in this view.
229 vst1q_u8(out+16, B1);
230 vst1q_u8(out+32, B2);
231 vst1q_u8(out+48, B3);
// One block per iteration: eleven vaeseq_u8 + vaesmcq_u8 steps with K0..K10,
// then a final vaeseq_u8 with K11 (no vaesmcq_u8) XORed with K12.
238 for(
size_t i = 0; i != blocks; ++i)
240 uint8x16_t B = vld1q_u8(in+16*i);
241 B = vaesmcq_u8(vaeseq_u8(B, K0));
242 B = vaesmcq_u8(vaeseq_u8(B, K1));
243 B = vaesmcq_u8(vaeseq_u8(B, K2));
244 B = vaesmcq_u8(vaeseq_u8(B, K3));
245 B = vaesmcq_u8(vaeseq_u8(B, K4));
246 B = vaesmcq_u8(vaeseq_u8(B, K5));
247 B = vaesmcq_u8(vaeseq_u8(B, K6));
248 B = vaesmcq_u8(vaeseq_u8(B, K7));
249 B = vaesmcq_u8(vaeseq_u8(B, K8));
250 B = vaesmcq_u8(vaeseq_u8(B, K9));
251 B = vaesmcq_u8(vaeseq_u8(B, K10));
252 B = veorq_u8(vaeseq_u8(B, K11), K12);
253 vst1q_u8(out+16*i, B);
// AES-192 decryption of `blocks` 16-byte blocks using ARMv8 AES intrinsics.
//   in     - input bytes; the visible loop reads 16 bytes at in + 16*i
//   out    - output bytes; the visible loop writes 16 bytes at out + 16*i
//   blocks - number of 16-byte blocks handled by the visible loop
// NOTE(review): the embedded line numbers skip values and no braces are
// visible, so parts of this body are missing from this view. Comments below
// describe only the statements that are shown.
261 void AES_192::armv8_decrypt_n(const uint8_t in[], uint8_t out[],
size_t blocks)
const
// Byte-wise views of the key storage held in members m_DK and m_MD.
263 const uint8_t *skey =
reinterpret_cast<const uint8_t*
>(m_DK.data());
264 const uint8_t *mkey =
reinterpret_cast<const uint8_t*
>(m_MD.data());
// Thirteen 16-byte key vectors: K0..K11 come from consecutive 16-byte slots
// of m_DK, while the last one (K12) is loaded from m_MD.
266 const uint8x16_t K0 = vld1q_u8(skey + 0);
267 const uint8x16_t K1 = vld1q_u8(skey + 16);
268 const uint8x16_t K2 = vld1q_u8(skey + 32);
269 const uint8x16_t K3 = vld1q_u8(skey + 48);
270 const uint8x16_t K4 = vld1q_u8(skey + 64);
271 const uint8x16_t K5 = vld1q_u8(skey + 80);
272 const uint8x16_t K6 = vld1q_u8(skey + 96);
273 const uint8x16_t K7 = vld1q_u8(skey + 112);
274 const uint8x16_t K8 = vld1q_u8(skey + 128);
275 const uint8x16_t K9 = vld1q_u8(skey + 144);
276 const uint8x16_t K10 = vld1q_u8(skey + 160);
277 const uint8x16_t K11 = vld1q_u8(skey + 176);
278 const uint8x16_t K12 = vld1q_u8(mkey);
// Loads four consecutive blocks (64 bytes) into B0..B3, the names the
// AES_DEC_4_* macros operate on.
// NOTE(review): the statement enclosing these loads and the macro
// invocations are not visible here - presumably a path that handles four
// blocks at a time; confirm against the full file.
282 uint8x16_t B0 = vld1q_u8(in);
283 uint8x16_t B1 = vld1q_u8(in+16);
284 uint8x16_t B2 = vld1q_u8(in+32);
285 uint8x16_t B3 = vld1q_u8(in+48);
// Writes B1..B3 back to the second, third and fourth 16-byte output slots.
// NOTE(review): the store of B0 to `out` is not visible in this view.
301 vst1q_u8(out+16, B1);
302 vst1q_u8(out+32, B2);
303 vst1q_u8(out+48, B3);
// One block per iteration: eleven vaesdq_u8 + vaesimcq_u8 steps with
// K0..K10, then a final vaesdq_u8 with K11 (no vaesimcq_u8) XORed with K12.
310 for(
size_t i = 0; i != blocks; ++i)
312 uint8x16_t B = vld1q_u8(in+16*i);
313 B = vaesimcq_u8(vaesdq_u8(B, K0));
314 B = vaesimcq_u8(vaesdq_u8(B, K1));
315 B = vaesimcq_u8(vaesdq_u8(B, K2));
316 B = vaesimcq_u8(vaesdq_u8(B, K3));
317 B = vaesimcq_u8(vaesdq_u8(B, K4));
318 B = vaesimcq_u8(vaesdq_u8(B, K5));
319 B = vaesimcq_u8(vaesdq_u8(B, K6));
320 B = vaesimcq_u8(vaesdq_u8(B, K7));
321 B = vaesimcq_u8(vaesdq_u8(B, K8));
322 B = vaesimcq_u8(vaesdq_u8(B, K9));
323 B = vaesimcq_u8(vaesdq_u8(B, K10));
324 B = veorq_u8(vaesdq_u8(B, K11), K12);
325 vst1q_u8(out+16*i, B);
// AES-256 encryption of `blocks` 16-byte blocks using ARMv8 AES intrinsics.
//   in     - input bytes; the visible loop reads 16 bytes at in + 16*i
//   out    - output bytes; the visible loop writes 16 bytes at out + 16*i
//   blocks - number of 16-byte blocks handled by the visible loop
// NOTE(review): the embedded line numbers skip values and no braces are
// visible, so parts of this body are missing from this view. Comments below
// describe only the statements that are shown.
333 void AES_256::armv8_encrypt_n(const uint8_t in[], uint8_t out[],
size_t blocks)
const
// Byte-wise views of the key storage held in members m_EK and m_ME.
335 const uint8_t *skey =
reinterpret_cast<const uint8_t*
>(m_EK.data());
336 const uint8_t *mkey =
reinterpret_cast<const uint8_t*
>(m_ME.data());
// Fifteen 16-byte key vectors: K0..K13 come from consecutive 16-byte slots
// of m_EK, while the last one (K14) is loaded from m_ME.
338 const uint8x16_t K0 = vld1q_u8(skey + 0);
339 const uint8x16_t K1 = vld1q_u8(skey + 16);
340 const uint8x16_t K2 = vld1q_u8(skey + 32);
341 const uint8x16_t K3 = vld1q_u8(skey + 48);
342 const uint8x16_t K4 = vld1q_u8(skey + 64);
343 const uint8x16_t K5 = vld1q_u8(skey + 80);
344 const uint8x16_t K6 = vld1q_u8(skey + 96);
345 const uint8x16_t K7 = vld1q_u8(skey + 112);
346 const uint8x16_t K8 = vld1q_u8(skey + 128);
347 const uint8x16_t K9 = vld1q_u8(skey + 144);
348 const uint8x16_t K10 = vld1q_u8(skey + 160);
349 const uint8x16_t K11 = vld1q_u8(skey + 176);
350 const uint8x16_t K12 = vld1q_u8(skey + 192);
351 const uint8x16_t K13 = vld1q_u8(skey + 208);
352 const uint8x16_t K14 = vld1q_u8(mkey);
// Loads four consecutive blocks (64 bytes) into B0..B3, the names the
// AES_ENC_4_* macros operate on.
// NOTE(review): the statement enclosing these loads and the macro
// invocations are not visible here - presumably a path that handles four
// blocks at a time; confirm against the full file.
356 uint8x16_t B0 = vld1q_u8(in);
357 uint8x16_t B1 = vld1q_u8(in+16);
358 uint8x16_t B2 = vld1q_u8(in+32);
359 uint8x16_t B3 = vld1q_u8(in+48);
// Writes B1..B3 back to the second, third and fourth 16-byte output slots.
// NOTE(review): the store of B0 to `out` is not visible in this view.
377 vst1q_u8(out+16, B1);
378 vst1q_u8(out+32, B2);
379 vst1q_u8(out+48, B3);
// One block per iteration: thirteen vaeseq_u8 + vaesmcq_u8 steps with
// K0..K12, then a final vaeseq_u8 with K13 (no vaesmcq_u8) XORed with K14.
386 for(
size_t i = 0; i != blocks; ++i)
388 uint8x16_t B = vld1q_u8(in+16*i);
389 B = vaesmcq_u8(vaeseq_u8(B, K0));
390 B = vaesmcq_u8(vaeseq_u8(B, K1));
391 B = vaesmcq_u8(vaeseq_u8(B, K2));
392 B = vaesmcq_u8(vaeseq_u8(B, K3));
393 B = vaesmcq_u8(vaeseq_u8(B, K4));
394 B = vaesmcq_u8(vaeseq_u8(B, K5));
395 B = vaesmcq_u8(vaeseq_u8(B, K6));
396 B = vaesmcq_u8(vaeseq_u8(B, K7));
397 B = vaesmcq_u8(vaeseq_u8(B, K8));
398 B = vaesmcq_u8(vaeseq_u8(B, K9));
399 B = vaesmcq_u8(vaeseq_u8(B, K10));
400 B = vaesmcq_u8(vaeseq_u8(B, K11));
401 B = vaesmcq_u8(vaeseq_u8(B, K12));
402 B = veorq_u8(vaeseq_u8(B, K13), K14);
403 vst1q_u8(out+16*i, B);
// AES-256 decryption of `blocks` 16-byte blocks using ARMv8 AES intrinsics.
//   in     - input bytes; the visible loop reads 16 bytes at in + 16*i
//   out    - output bytes; the visible loop writes 16 bytes at out + 16*i
//   blocks - number of 16-byte blocks handled by the visible loop
// NOTE(review): the embedded line numbers skip values and no braces are
// visible, so parts of this body are missing from this view. Comments below
// describe only the statements that are shown.
411 void AES_256::armv8_decrypt_n(const uint8_t in[], uint8_t out[],
size_t blocks)
const
// Byte-wise views of the key storage held in members m_DK and m_MD.
413 const uint8_t *skey =
reinterpret_cast<const uint8_t*
>(m_DK.data());
414 const uint8_t *mkey =
reinterpret_cast<const uint8_t*
>(m_MD.data());
// Fifteen 16-byte key vectors: K0..K13 come from consecutive 16-byte slots
// of m_DK, while the last one (K14) is loaded from m_MD.
416 const uint8x16_t K0 = vld1q_u8(skey + 0);
417 const uint8x16_t K1 = vld1q_u8(skey + 16);
418 const uint8x16_t K2 = vld1q_u8(skey + 32);
419 const uint8x16_t K3 = vld1q_u8(skey + 48);
420 const uint8x16_t K4 = vld1q_u8(skey + 64);
421 const uint8x16_t K5 = vld1q_u8(skey + 80);
422 const uint8x16_t K6 = vld1q_u8(skey + 96);
423 const uint8x16_t K7 = vld1q_u8(skey + 112);
424 const uint8x16_t K8 = vld1q_u8(skey + 128);
425 const uint8x16_t K9 = vld1q_u8(skey + 144);
426 const uint8x16_t K10 = vld1q_u8(skey + 160);
427 const uint8x16_t K11 = vld1q_u8(skey + 176);
428 const uint8x16_t K12 = vld1q_u8(skey + 192);
429 const uint8x16_t K13 = vld1q_u8(skey + 208);
430 const uint8x16_t K14 = vld1q_u8(mkey);
// Loads four consecutive blocks (64 bytes) into B0..B3, the names the
// AES_DEC_4_* macros operate on.
// NOTE(review): the statement enclosing these loads and the macro
// invocations are not visible here - presumably a path that handles four
// blocks at a time; confirm against the full file.
434 uint8x16_t B0 = vld1q_u8(in);
435 uint8x16_t B1 = vld1q_u8(in+16);
436 uint8x16_t B2 = vld1q_u8(in+32);
437 uint8x16_t B3 = vld1q_u8(in+48);
// Writes B1..B3 back to the second, third and fourth 16-byte output slots.
// NOTE(review): the store of B0 to `out` is not visible in this view.
455 vst1q_u8(out+16, B1);
456 vst1q_u8(out+32, B2);
457 vst1q_u8(out+48, B3);
// One block per iteration: thirteen vaesdq_u8 + vaesimcq_u8 steps with
// K0..K12, then a final vaesdq_u8 with K13 (no vaesimcq_u8) XORed with K14.
464 for(
size_t i = 0; i != blocks; ++i)
466 uint8x16_t B = vld1q_u8(in+16*i);
467 B = vaesimcq_u8(vaesdq_u8(B, K0));
468 B = vaesimcq_u8(vaesdq_u8(B, K1));
469 B = vaesimcq_u8(vaesdq_u8(B, K2));
470 B = vaesimcq_u8(vaesdq_u8(B, K3));
471 B = vaesimcq_u8(vaesdq_u8(B, K4));
472 B = vaesimcq_u8(vaesdq_u8(B, K5));
473 B = vaesimcq_u8(vaesdq_u8(B, K6));
474 B = vaesimcq_u8(vaesdq_u8(B, K7));
475 B = vaesimcq_u8(vaesdq_u8(B, K8));
476 B = vaesimcq_u8(vaesdq_u8(B, K9));
477 B = vaesimcq_u8(vaesdq_u8(B, K10));
478 B = vaesimcq_u8(vaesdq_u8(B, K11));
479 B = vaesimcq_u8(vaesdq_u8(B, K12));
480 B = veorq_u8(vaesdq_u8(B, K13), K14);
481 vst1q_u8(out+16*i, B);
// Remove the four block-helper macros once the functions that use them end.
485 #undef AES_ENC_4_ROUNDS
486 #undef AES_ENC_4_LAST_ROUNDS
487 #undef AES_DEC_4_ROUNDS
488 #undef AES_DEC_4_LAST_ROUNDS
// NOTE(review): the definitions below have empty replacement lists, so each
// of these macros expands to nothing from this point on. BOTAN_FUNC_ISA is
// not used in any visible line, and the four AES_* names were just
// undefined above. Unlike the rest of the file, these lines carry no
// embedded line number, so they may have been appended by tooling - confirm.
#define BOTAN_FUNC_ISA(isa)
#define AES_DEC_4_ROUNDS(K)
#define AES_ENC_4_ROUNDS(K)
#define AES_ENC_4_LAST_ROUNDS(K, K2)
#define AES_DEC_4_LAST_ROUNDS(K, K2)