8 #include <botan/threefish.h>
/*
* Regroups the eight 64-bit lanes held in X0 and X1; both are updated in place.
* With X0 = {a0,a1,a2,a3} and X1 = {b0,b1,b2,b3} on entry (lane 0 first),
* the result is X0 = {a0,a2,b0,b2} and X1 = {a1,a3,b1,b3}, i.e. X0 collects
* the even-indexed and X1 the odd-indexed lanes of the pair.
*/
16 inline
void interleave_epi64(__m256i& X0, __m256i& X1)
// The unpack intrinsics work within each 128-bit half:
// T0 = {a0,b0,a2,b2}, T1 = {a1,b1,a3,b3}.
21 const __m256i T0 = _mm256_unpacklo_epi64(X0, X1);
22 const __m256i T1 = _mm256_unpackhi_epi64(X0, X1);
// _MM_SHUFFLE(3,1,2,0) swaps the two middle lanes, so the lanes that came
// from X0 end up in the low half and those from X1 in the high half.
24 X0 = _mm256_permute4x64_epi64(T0, _MM_SHUFFLE(3,1,2,0));
25 X1 = _mm256_permute4x64_epi64(T1, _MM_SHUFFLE(3,1,2,0));
/*
* Inverse of interleave_epi64; X0 and X1 are updated in place.
* With X0 = {x0,x1,x2,x3} and X1 = {y0,y1,y2,y3} on entry (lane 0 first),
* the result is X0 = {x0,y0,x1,y1} and X1 = {x2,y2,x3,y3}.
*/
29 inline
void deinterleave_epi64(__m256i& X0, __m256i& X1)
// Swap the two middle lanes first: T0 = {x0,x2,x1,x3}, T1 = {y0,y2,y1,y3}.
31 const __m256i T0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(3,1,2,0));
32 const __m256i T1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(3,1,2,0));
// Then pair lanes up within each 128-bit half to restore the original order.
34 X0 = _mm256_unpacklo_epi64(T0, T1);
35 X1 = _mm256_unpackhi_epi64(T0, T1);
/*
* Advances two key vectors in place, using a third as read-only input.
* With lane 0 listed first, on return:
*   R0 = {R1[1], R1[2], R1[3], R0[0]}
*   R1 = {R2[1], R2[2], R2[3], R1[0]}
* where the right-hand sides are the values on entry. R2 is taken by value
* and is not modified for the caller.
*/
39 inline
void rotate_keys(__m256i& R0, __m256i& R1, __m256i R2)
// T0 broadcasts lane 0 of R0 to every lane; T1 and T2 are R1 and R2 rotated
// down by one lane (lane i takes the old lane i+1, lane 3 takes the old lane 0).
66 __m256i T0 = _mm256_permute4x64_epi64(R0, _MM_SHUFFLE(0,0,0,0));
67 __m256i T1 = _mm256_permute4x64_epi64(R1, _MM_SHUFFLE(0,3,2,1));
68 __m256i T2 = _mm256_permute4x64_epi64(R2, _MM_SHUFFLE(0,3,2,1));
// Mask 0xC0 selects 32-bit elements 6 and 7 (the top 64-bit lane) from the
// second operand and everything else from the first.
70 R0 = _mm256_blend_epi32(T1, T0, 0xC0);
71 R1 = _mm256_blend_epi32(T2, T1, 0xC0);
78 void Threefish_512::avx2_encrypt_n(const uint8_t in[], uint8_t out[],
size_t blocks)
const
80 const uint64_t* K = &get_K()[0];
81 const uint64_t* T_64 = &get_T()[0];
83 const __m256i ROTATE_1 = _mm256_set_epi64x(37,19,36,46);
84 const __m256i ROTATE_2 = _mm256_set_epi64x(42,14,27,33);
85 const __m256i ROTATE_3 = _mm256_set_epi64x(39,36,49,17);
86 const __m256i ROTATE_4 = _mm256_set_epi64x(56,54, 9,44);
87 const __m256i ROTATE_5 = _mm256_set_epi64x(24,34,30,39);
88 const __m256i ROTATE_6 = _mm256_set_epi64x(17,10,50,13);
89 const __m256i ROTATE_7 = _mm256_set_epi64x(43,39,29,25);
90 const __m256i ROTATE_8 = _mm256_set_epi64x(22,56,35, 8);
92 #define THREEFISH_ROUND(X0, X1, SHL) \
94 const __m256i SHR = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHL); \
95 X0 = _mm256_add_epi64(X0, X1); \
96 X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \
97 X1 = _mm256_xor_si256(X1, X0); \
98 X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(0, 3, 2, 1)); \
99 X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0)); \
102 #define THREEFISH_ROUND_2(X0, X1, X2, X3, SHL) \
104 const __m256i SHR = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHL); \
105 X0 = _mm256_add_epi64(X0, X1); \
106 X2 = _mm256_add_epi64(X2, X3); \
107 X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \
108 X3 = _mm256_or_si256(_mm256_sllv_epi64(X3, SHL), _mm256_srlv_epi64(X3, SHR)); \
109 X1 = _mm256_xor_si256(X1, X0); \
110 X3 = _mm256_xor_si256(X3, X2); \
111 X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(0, 3, 2, 1)); \
112 X2 = _mm256_permute4x64_epi64(X2, _MM_SHUFFLE(0, 3, 2, 1)); \
113 X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0)); \
114 X3 = _mm256_permute4x64_epi64(X3, _MM_SHUFFLE(1, 2, 3, 0)); \
117 #define THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, T0I, T1I) \
119 const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
120 const __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \
121 X0 = _mm256_add_epi64(X0, K0); \
122 X1 = _mm256_add_epi64(X1, K1); \
123 X1 = _mm256_add_epi64(X1, _mm256_set_epi64x(R,0,0,0)); \
124 X0 = _mm256_add_epi64(X0, T0); \
125 X1 = _mm256_add_epi64(X1, T1); \
128 #define THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, T0I, T1I) \
130 const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
131 __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \
132 X0 = _mm256_add_epi64(X0, K0); \
133 X2 = _mm256_add_epi64(X2, K0); \
134 X1 = _mm256_add_epi64(X1, K1); \
135 X3 = _mm256_add_epi64(X3, K1); \
136 T1 = _mm256_add_epi64(T1, _mm256_set_epi64x(R,0,0,0)); \
137 X0 = _mm256_add_epi64(X0, T0); \
138 X2 = _mm256_add_epi64(X2, T0); \
139 X1 = _mm256_add_epi64(X1, T1); \
140 X3 = _mm256_add_epi64(X3, T1); \
143 #define THREEFISH_ENC_8_ROUNDS(X0, X1, R, K0, K1, K2, T0, T1, T2) \
145 rotate_keys(K1, K2, K0); \
146 THREEFISH_ROUND(X0, X1, ROTATE_1); \
147 THREEFISH_ROUND(X0, X1, ROTATE_2); \
148 THREEFISH_ROUND(X0, X1, ROTATE_3); \
149 THREEFISH_ROUND(X0, X1, ROTATE_4); \
150 THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, T0, T1); \
152 THREEFISH_ROUND(X0, X1, ROTATE_5); \
153 THREEFISH_ROUND(X0, X1, ROTATE_6); \
154 THREEFISH_ROUND(X0, X1, ROTATE_7); \
155 THREEFISH_ROUND(X0, X1, ROTATE_8); \
156 THREEFISH_INJECT_KEY(X0, X1, R+1, K1, K2, T2, T0); \
159 #define THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K0, K1, K2, T0, T1, T2) \
161 rotate_keys(K1, K2, K0); \
162 THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_1); \
163 THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_2); \
164 THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_3); \
165 THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_4); \
166 THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, T0, T1); \
168 THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_5); \
169 THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_6); \
170 THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_7); \
171 THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_8); \
172 THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R+1, K1, K2, T2, T0); \
175 __m256i K0 = _mm256_set_epi64x(K[5], K[3], K[1], K[8]);
176 __m256i K1 = _mm256_set_epi64x(K[6], K[4], K[2], K[0]);
177 __m256i K2 = _mm256_set_epi64x(K[7], K[5], K[3], K[1]);
179 const __m256i* in_mm =
reinterpret_cast<const __m256i*
>(in);
180 __m256i* out_mm =
reinterpret_cast<__m256i*
>(out);
184 __m256i X0 = _mm256_loadu_si256(in_mm++);
185 __m256i X1 = _mm256_loadu_si256(in_mm++);
186 __m256i X2 = _mm256_loadu_si256(in_mm++);
187 __m256i X3 = _mm256_loadu_si256(in_mm++);
189 const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);
191 interleave_epi64(X0, X1);
192 interleave_epi64(X2, X3);
196 THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 1, K2,K0,K1, 1, 2, 3);
197 THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 3, K1,K2,K0, 2, 3, 1);
198 THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 5, K0,K1,K2, 3, 1, 2);
199 THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 7, K2,K0,K1, 1, 2, 3);
200 THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 9, K1,K2,K0, 2, 3, 1);
201 THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 11, K0,K1,K2, 3, 1, 2);
202 THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 13, K2,K0,K1, 1, 2, 3);
203 THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 15, K1,K2,K0, 2, 3, 1);
204 THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 17, K0,K1,K2, 3, 1, 2);
206 deinterleave_epi64(X0, X1);
207 deinterleave_epi64(X2, X3);
209 _mm256_storeu_si256(out_mm++, X0);
210 _mm256_storeu_si256(out_mm++, X1);
211 _mm256_storeu_si256(out_mm++, X2);
212 _mm256_storeu_si256(out_mm++, X3);
217 for(
size_t i = 0; i != blocks; ++i)
219 __m256i X0 = _mm256_loadu_si256(in_mm++);
220 __m256i X1 = _mm256_loadu_si256(in_mm++);
222 const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);
224 interleave_epi64(X0, X1);
238 deinterleave_epi64(X0, X1);
240 _mm256_storeu_si256(out_mm++, X0);
241 _mm256_storeu_si256(out_mm++, X1);
244 #undef THREEFISH_ENC_8_ROUNDS
245 #undef THREEFISH_ROUND
246 #undef THREEFISH_INJECT_KEY
247 #undef THREEFISH_DEC_2_8_ROUNDS
248 #undef THREEFISH_ROUND_2
249 #undef THREEFISH_INJECT_KEY_2
/*
* Threefish_512::avx2_decrypt_n: AVX2 decryption path.
*
* in     - input bytes, read through unaligned 256-bit loads
* out    - output bytes, written through unaligned 256-bit stores
* blocks - block count; the visible single-block loop runs `blocks` times and
*          consumes two 256-bit vectors (64 bytes) per iteration
*
* Key words are read from get_K() (indices 0..8) and tweak words from get_T()
* (indices 0..2). The macros here reuse the names of the encryption helpers
* but perform the inverse steps: subtraction instead of addition, right
* rotation instead of left, and rounds applied from ROTATE_8 down to ROTATE_1.
*
* NOTE(review): this view of the file has lost brace-only lines and some
* statements (the loop header around the two-block section and the round
* calls inside the single-block loop are not visible) - confirm the control
* flow against the full file.
*/
253 void Threefish_512::avx2_decrypt_n(const uint8_t in[], uint8_t out[],
size_t blocks)
const
255 const uint64_t* K = &get_K()[0];
256 const uint64_t* T_64 = &get_T()[0];
// Per-lane rotation counts, one vector per round of an eight-round group.
// _mm256_set_epi64x takes the highest lane first, so lane 0 is the last argument.
258 const __m256i ROTATE_1 = _mm256_set_epi64x(37,19,36,46);
259 const __m256i ROTATE_2 = _mm256_set_epi64x(42,14,27,33);
260 const __m256i ROTATE_3 = _mm256_set_epi64x(39,36,49,17);
261 const __m256i ROTATE_4 = _mm256_set_epi64x(56,54, 9,44);
262 const __m256i ROTATE_5 = _mm256_set_epi64x(24,34,30,39);
263 const __m256i ROTATE_6 = _mm256_set_epi64x(17,10,50,13);
264 const __m256i ROTATE_7 = _mm256_set_epi64x(43,39,29,25);
265 const __m256i ROTATE_8 = _mm256_set_epi64x(22,56,35, 8);
267 #define THREEFISH_ROUND(X0, X1, SHR) /* one inverse round on one block; SHR holds per-lane right-rotation counts */ \
269 const __m256i SHL = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHR); \
270 X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(2, 1, 0, 3)); /* new lanes = old {3,0,1,2} */ \
271 X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0)); /* new lanes = old {0,3,2,1} */ \
272 X1 = _mm256_xor_si256(X1, X0); \
273 X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); /* rotate each lane right by SHR */ \
274 X0 = _mm256_sub_epi64(X0, X1); \
277 #define THREEFISH_ROUND_2(X0, X1, X2, X3, SHR) /* same inverse round applied to two blocks: (X0,X1) and (X2,X3) */ \
279 const __m256i SHL = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHR); \
280 X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(2, 1, 0, 3)); \
281 X2 = _mm256_permute4x64_epi64(X2, _MM_SHUFFLE(2, 1, 0, 3)); \
282 X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0)); \
283 X3 = _mm256_permute4x64_epi64(X3, _MM_SHUFFLE(1, 2, 3, 0)); \
284 X1 = _mm256_xor_si256(X1, X0); \
285 X3 = _mm256_xor_si256(X3, X2); \
286 X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \
287 X3 = _mm256_or_si256(_mm256_sllv_epi64(X3, SHL), _mm256_srlv_epi64(X3, SHR)); \
288 X0 = _mm256_sub_epi64(X0, X1); \
289 X2 = _mm256_sub_epi64(X2, X3); \
292 #define THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, T0I, T1I) /* subtracts key vectors, tweak lanes and counter R; reads `T` from the enclosing scope */ \
294 const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); /* {T[0],T[0],T[0],T[T0I]}; T[0] is 0 */ \
295 const __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); /* {T[0],T[0],T[T1I],T[0]} */ \
296 X0 = _mm256_sub_epi64(X0, K0); \
297 X1 = _mm256_sub_epi64(X1, K1); \
298 X1 = _mm256_sub_epi64(X1, _mm256_set_epi64x(R, 0, 0, 0)); /* R comes out of lane 3 only */ \
299 X0 = _mm256_sub_epi64(X0, T0); \
300 X1 = _mm256_sub_epi64(X1, T1); \
303 #define THREEFISH_DEC_8_ROUNDS(X0, X1, R, K1, K2, K3, T0, T1, T2) /* remove key (R+1), 4 inverse rounds, remove key (R), 4 inverse rounds */ \
305 THREEFISH_INJECT_KEY(X0, X1, R+1, K2, K3, T2, T0); \
306 THREEFISH_ROUND(X0, X1, ROTATE_8); \
307 THREEFISH_ROUND(X0, X1, ROTATE_7); \
308 THREEFISH_ROUND(X0, X1, ROTATE_6); \
309 THREEFISH_ROUND(X0, X1, ROTATE_5); \
311 THREEFISH_INJECT_KEY(X0, X1, R, K1, K2, T0, T1); \
312 THREEFISH_ROUND(X0, X1, ROTATE_4); \
313 THREEFISH_ROUND(X0, X1, ROTATE_3); \
314 THREEFISH_ROUND(X0, X1, ROTATE_2); \
315 THREEFISH_ROUND(X0, X1, ROTATE_1); \
318 #define THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, T0I, T1I) /* two-block key removal */ \
320 const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
321 __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \
322 X0 = _mm256_sub_epi64(X0, K0); \
323 X2 = _mm256_sub_epi64(X2, K0); \
324 X1 = _mm256_sub_epi64(X1, K1); \
325 X3 = _mm256_sub_epi64(X3, K1); \
326 T1 = _mm256_add_epi64(T1, _mm256_set_epi64x(R,0,0,0)); /* add here: the combined T1 + R is subtracted below */ \
327 X0 = _mm256_sub_epi64(X0, T0); \
328 X2 = _mm256_sub_epi64(X2, T0); \
329 X1 = _mm256_sub_epi64(X1, T1); \
330 X3 = _mm256_sub_epi64(X3, T1); \
333 #define THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, R, K1, K2, K3, T0, T1, T2) /* two-block form of THREEFISH_DEC_8_ROUNDS */ \
335 THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R+1, K2, K3, T2, T0); \
336 THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_8); \
337 THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_7); \
338 THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_6); \
339 THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_5); \
341 THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K1, K2, T0, T1); \
342 THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_4); \
343 THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_3); \
344 THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_2); \
345 THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_1); \
353 const __m256i K0 = _mm256_set_epi64x(K[6], K[4], K[2], K[0]); // lanes 0..3 = K[0], K[2], K[4], K[6]
// Nine constant key vectors are built up front. Lane j of Kn holds
// K[(n + 2*j) % 9], so each vector starts one key word later than the last.
354 const __m256i K1 = _mm256_set_epi64x(K[7], K[5], K[3], K[1]);
355 const __m256i K2 = _mm256_set_epi64x(K[8], K[6], K[4], K[2]);
356 const __m256i K3 = _mm256_set_epi64x(K[0], K[7], K[5], K[3]);
357 const __m256i K4 = _mm256_set_epi64x(K[1], K[8], K[6], K[4]);
358 const __m256i K5 = _mm256_set_epi64x(K[2], K[0], K[7], K[5]);
359 const __m256i K6 = _mm256_set_epi64x(K[3], K[1], K[8], K[6]);
360 const __m256i K7 = _mm256_set_epi64x(K[4], K[2], K[0], K[7]);
361 const __m256i K8 = _mm256_set_epi64x(K[5], K[3], K[1], K[8]);
363 const __m256i* in_mm =
reinterpret_cast<const __m256i*
>(in);
364 __m256i* out_mm =
reinterpret_cast<__m256i*
>(out);
// Two-block path: four 256-bit loads cover 128 input bytes.
// NOTE(review): the loop header and block-count update that presumably
// surround this section are not visible in this view - confirm.
368 __m256i X0 = _mm256_loadu_si256(in_mm++);
369 __m256i X1 = _mm256_loadu_si256(in_mm++);
370 __m256i X2 = _mm256_loadu_si256(in_mm++);
371 __m256i X3 = _mm256_loadu_si256(in_mm++);
// Tweak vector, lanes 0..3 = 0, T_64[2], T_64[1], T_64[0]. The key macros
// pick lanes out of it by index, and lane 0 supplies the zero filler.
373 const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);
375 interleave_epi64(X0, X1);
376 interleave_epi64(X2, X3);
// Nine invocations of eight inverse rounds each, with the counter argument
// stepping down from 17 to 1 (each invocation also uses counter + 1).
378 THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 17, K8,K0,K1, 3, 1, 2);
379 THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 15, K6,K7,K8, 2, 3, 1);
380 THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 13, K4,K5,K6, 1, 2, 3);
381 THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 11, K2,K3,K4, 3, 1, 2);
382 THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 9, K0,K1,K2, 2, 3, 1);
383 THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 7, K7,K8,K0, 1, 2, 3);
384 THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 5, K5,K6,K7, 3, 1, 2);
385 THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 3, K3,K4,K5, 2, 3, 1);
386 THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 1, K1,K2,K3, 1, 2, 3);
// NOTE(review): no key removal with counter 0 is visible after these calls;
// presumably it was dropped from this view - confirm against the full file.
// Restore the original lane order before writing the two blocks out.
390 deinterleave_epi64(X0, X1);
391 deinterleave_epi64(X2, X3);
393 _mm256_storeu_si256(out_mm++, X0);
394 _mm256_storeu_si256(out_mm++, X1);
395 _mm256_storeu_si256(out_mm++, X2);
396 _mm256_storeu_si256(out_mm++, X3);
// Single-block path: one 64-byte block (two vectors) per iteration.
// NOTE(review): the round invocations between interleave and deinterleave
// are not visible in this view - confirm against the full file.
401 for(
size_t i = 0; i != blocks; ++i)
403 __m256i X0 = _mm256_loadu_si256(in_mm++);
404 __m256i X1 = _mm256_loadu_si256(in_mm++);
406 const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);
408 interleave_epi64(X0, X1);
422 deinterleave_epi64(X0, X1);
424 _mm256_storeu_si256(out_mm++, X0);
425 _mm256_storeu_si256(out_mm++, X1);
// Remove every helper macro defined above so none of them leaks past this function.
428 #undef THREEFISH_DEC_8_ROUNDS
429 #undef THREEFISH_ROUND
430 #undef THREEFISH_INJECT_KEY
431 #undef THREEFISH_DEC_2_8_ROUNDS
432 #undef THREEFISH_ROUND_2
433 #undef THREEFISH_INJECT_KEY_2
// Function-like macros with empty replacement lists: every use that follows
// these definitions expands to nothing.
// NOTE(review): these carry no line-number residue, unlike the rest of the
// file, and look like tooling stubs rather than hand-written code - confirm
// whether they belong in the file at all.
#define THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, R, K1, K2, K3, T0, T1, T2)
#define THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, T0I, T1I)
#define THREEFISH_DEC_8_ROUNDS(X0, X1, R, K1, K2, K3, T0, T1, T2)
#define THREEFISH_ENC_8_ROUNDS(X0, X1, R, K0, K1, K2, T0, T1, T2)
#define BOTAN_FUNC_ISA(isa)
#define THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K0, K1, K2, T0, T1, T2)
#define THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, T0I, T1I)