threefish_512_avx2.cpp (Botan 2.19.1)
/*
* Threefish-512 using AVX2
* (C) 2013,2016 Jack Lloyd
*
* Botan is released under the Simplified BSD License (see license.txt)
*/

#include <botan/threefish_512.h>
#include <immintrin.h>

namespace Botan {

namespace {

BOTAN_FUNC_ISA("avx2")
inline void interleave_epi64(__m256i& X0, __m256i& X1)
   {
   // interleave X0 and X1 qwords
   // (X0,X1,X2,X3),(X4,X5,X6,X7) -> (X0,X2,X4,X6),(X1,X3,X5,X7)

   const __m256i T0 = _mm256_unpacklo_epi64(X0, X1);
   const __m256i T1 = _mm256_unpackhi_epi64(X0, X1);

   X0 = _mm256_permute4x64_epi64(T0, _MM_SHUFFLE(3,1,2,0));
   X1 = _mm256_permute4x64_epi64(T1, _MM_SHUFFLE(3,1,2,0));
   }

BOTAN_FUNC_ISA("avx2")
inline void deinterleave_epi64(__m256i& X0, __m256i& X1)
   {
   const __m256i T0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(3,1,2,0));
   const __m256i T1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(3,1,2,0));

   X0 = _mm256_unpacklo_epi64(T0, T1);
   X1 = _mm256_unpackhi_epi64(T0, T1);
   }
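
// A minimal scalar sketch (added for illustration only; it is not part of the
// original implementation and is never called) of the word shuffle that
// interleave_epi64/deinterleave_epi64 perform: the eight 64-bit words of a
// 512-bit block are split into the even-indexed and odd-indexed words, so each
// __m256i register carries words (0,2,4,6) or (1,3,5,7) during the rounds.
inline void interleave_epi64_scalar_sketch(uint64_t X0[4], uint64_t X1[4])
   {
   const uint64_t W[8] = { X0[0], X0[1], X0[2], X0[3],
                           X1[0], X1[1], X1[2], X1[3] };

   X0[0] = W[0]; X0[1] = W[2]; X0[2] = W[4]; X0[3] = W[6]; // even words
   X1[0] = W[1]; X1[1] = W[3]; X1[2] = W[5]; X1[3] = W[7]; // odd words
   }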

BOTAN_FUNC_ISA("avx2")
inline void rotate_keys(__m256i& R0, __m256i& R1, __m256i R2)
   {
   /*
   Behold. The key schedule progresses like so. The values
   loop back to the originals after the rounds are complete,
   so we don't need to reload for starting the next block.

                R0        R1        R2
   K1,K2,K3 (7,5,3,1),(8,6,4,2),(0,7,5,3)
   K3,K4,K5 (0,7,5,3),(1,8,6,4),(2,0,7,5)
   K5,K6,K7 (2,0,7,5),(3,1,8,6),(4,2,0,7)

   K7,K8,K0 (4,2,0,7),(5,3,1,8),(6,4,2,0)
   K0,K1,K2 (6,4,2,0),(7,5,3,1),(8,6,4,2)
   K2,K3,K4 (8,6,4,2),(0,7,5,3),(1,8,6,4)

   K4,K5,K6 (1,8,6,4),(2,0,7,5),(3,1,8,6)
   K6,K7,K8 (3,1,8,6),(4,2,0,7),(5,3,1,8)
   K8,K0,K1 (5,3,1,8),(6,4,2,0),(7,5,3,1)

   To compute the values for the next round:
   R0 is R2 from the last round
   R1 becomes (R0[4],R1[1:3])
   R2 becomes (R1[4],R2[1:3])

   Uses 3 permutes and 2 blends, is there a faster way?
   */
   __m256i T0 = _mm256_permute4x64_epi64(R0, _MM_SHUFFLE(0,0,0,0));
   __m256i T1 = _mm256_permute4x64_epi64(R1, _MM_SHUFFLE(0,3,2,1));
   __m256i T2 = _mm256_permute4x64_epi64(R2, _MM_SHUFFLE(0,3,2,1));

   R0 = _mm256_blend_epi32(T1, T0, 0xC0);
   R1 = _mm256_blend_epi32(T2, T1, 0xC0);
   }
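
// For reference, a scalar sketch of the Threefish-512 subkey formula that the
// table above walks through (an illustrative assumption based on the Threefish
// specification; this helper is not called by the code below). K has 9 words,
// with K[8] = C240 ^ K[0] ^ ... ^ K[7], and T has 3 words with T[2] = T[0] ^ T[1].
inline void threefish512_subkey_sketch(const uint64_t K[9], const uint64_t T[3],
                                       uint64_t s, uint64_t subkey[8])
   {
   for(size_t i = 0; i != 8; ++i)
      subkey[i] = K[(s + i) % 9];

   subkey[5] += T[s % 3];
   subkey[6] += T[(s + 1) % 3];
   subkey[7] += s;
   }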


}

BOTAN_FUNC_ISA("avx2")
void Threefish_512::avx2_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
   {
   _mm256_zeroupper();

   const uint64_t* K = m_K.data();
   const uint64_t* T_64 = m_T.data();

   const __m256i ROTATE_1 = _mm256_set_epi64x(37,19,36,46);
   const __m256i ROTATE_2 = _mm256_set_epi64x(42,14,27,33);
   const __m256i ROTATE_3 = _mm256_set_epi64x(39,36,49,17);
   const __m256i ROTATE_4 = _mm256_set_epi64x(56,54, 9,44);
   const __m256i ROTATE_5 = _mm256_set_epi64x(24,34,30,39);
   const __m256i ROTATE_6 = _mm256_set_epi64x(17,10,50,13);
   const __m256i ROTATE_7 = _mm256_set_epi64x(43,39,29,25);
   const __m256i ROTATE_8 = _mm256_set_epi64x(22,56,35, 8);

#define THREEFISH_ROUND(X0, X1, SHL) \
   do { \
      const __m256i SHR = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHL); \
      X0 = _mm256_add_epi64(X0, X1); \
      X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \
      X1 = _mm256_xor_si256(X1, X0); \
      X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(0, 3, 2, 1)); \
      X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0)); \
   } while(0)
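
/*
A scalar sketch of what each 64-bit lane of THREEFISH_ROUND computes (added
here for illustration; the names mix/rotl are hypothetical and are not used
below). One Threefish MIX step is an add, a rotate-left by the per-round
constant, and an xor; the two trailing vpermq then apply the fixed
Threefish-512 word permutation so the next round's MIX pairs line up in the
same lanes again.

   a += b;
   b = rotl(b, r);   // rotl(x, r) = (x << r) | (x >> (64 - r))
   b ^= a;
*/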

#define THREEFISH_ROUND_2(X0, X1, X2, X3, SHL) \
   do { \
      const __m256i SHR = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHL); \
      X0 = _mm256_add_epi64(X0, X1); \
      X2 = _mm256_add_epi64(X2, X3); \
      X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \
      X3 = _mm256_or_si256(_mm256_sllv_epi64(X3, SHL), _mm256_srlv_epi64(X3, SHR)); \
      X1 = _mm256_xor_si256(X1, X0); \
      X3 = _mm256_xor_si256(X3, X2); \
      X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(0, 3, 2, 1)); \
      X2 = _mm256_permute4x64_epi64(X2, _MM_SHUFFLE(0, 3, 2, 1)); \
      X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0)); \
      X3 = _mm256_permute4x64_epi64(X3, _MM_SHUFFLE(1, 2, 3, 0)); \
   } while(0)

#define THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, T0I, T1I) \
   do { \
      const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
      const __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \
      X0 = _mm256_add_epi64(X0, K0); \
      X1 = _mm256_add_epi64(X1, K1); \
      X1 = _mm256_add_epi64(X1, _mm256_set_epi64x(R,0,0,0)); \
      X0 = _mm256_add_epi64(X0, T0); \
      X1 = _mm256_add_epi64(X1, T1); \
   } while(0)

#define THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, T0I, T1I) \
   do { \
      const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
      __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \
      X0 = _mm256_add_epi64(X0, K0); \
      X2 = _mm256_add_epi64(X2, K0); \
      X1 = _mm256_add_epi64(X1, K1); \
      X3 = _mm256_add_epi64(X3, K1); \
      T1 = _mm256_add_epi64(T1, _mm256_set_epi64x(R,0,0,0)); \
      X0 = _mm256_add_epi64(X0, T0); \
      X2 = _mm256_add_epi64(X2, T0); \
      X1 = _mm256_add_epi64(X1, T1); \
      X3 = _mm256_add_epi64(X3, T1); \
   } while(0)
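
/*
Layout note (descriptive comment added for clarity, applies to both key
injection macros above): after interleave_epi64, X0/X2 hold the even state
words (0,2,4,6) and X1/X3 the odd words (1,3,5,7), so K0 and K1 are the even
and odd halves of the subkey. T0I selects the tweak word added to state word 6
(lane 3 of X0/X2), T1I the tweak word added to state word 5 (lane 2 of X1/X3),
and R is the subkey counter added to word 7 (lane 3 of X1/X3), matching the
Threefish-512 key injection.
*/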

#define THREEFISH_ENC_8_ROUNDS(X0, X1, R, K0, K1, K2, T0, T1, T2) \
   do { \
      rotate_keys(K1, K2, K0); \
      THREEFISH_ROUND(X0, X1, ROTATE_1); \
      THREEFISH_ROUND(X0, X1, ROTATE_2); \
      THREEFISH_ROUND(X0, X1, ROTATE_3); \
      THREEFISH_ROUND(X0, X1, ROTATE_4); \
      THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, T0, T1); \
      \
      THREEFISH_ROUND(X0, X1, ROTATE_5); \
      THREEFISH_ROUND(X0, X1, ROTATE_6); \
      THREEFISH_ROUND(X0, X1, ROTATE_7); \
      THREEFISH_ROUND(X0, X1, ROTATE_8); \
      THREEFISH_INJECT_KEY(X0, X1, R+1, K1, K2, T2, T0); \
   } while(0)

#define THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K0, K1, K2, T0, T1, T2) \
   do { \
      rotate_keys(K1, K2, K0); \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_1); \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_2); \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_3); \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_4); \
      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, T0, T1); \
      \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_5); \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_6); \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_7); \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_8); \
      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R+1, K1, K2, T2, T0); \
   } while(0)

   __m256i K0 = _mm256_set_epi64x(K[5], K[3], K[1], K[8]);
   __m256i K1 = _mm256_set_epi64x(K[6], K[4], K[2], K[0]);
   __m256i K2 = _mm256_set_epi64x(K[7], K[5], K[3], K[1]);

   const __m256i* in_mm = reinterpret_cast<const __m256i*>(in);
   __m256i* out_mm = reinterpret_cast<__m256i*>(out);

   while(blocks >= 2)
      {
      __m256i X0 = _mm256_loadu_si256(in_mm++);
      __m256i X1 = _mm256_loadu_si256(in_mm++);
      __m256i X2 = _mm256_loadu_si256(in_mm++);
      __m256i X3 = _mm256_loadu_si256(in_mm++);

      const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);

      interleave_epi64(X0, X1);
      interleave_epi64(X2, X3);

      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, 0, K1, K2, 2, 3);

      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 1, K2,K0,K1, 1, 2, 3);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 3, K1,K2,K0, 2, 3, 1);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 5, K0,K1,K2, 3, 1, 2);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 7, K2,K0,K1, 1, 2, 3);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 9, K1,K2,K0, 2, 3, 1);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 11, K0,K1,K2, 3, 1, 2);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 13, K2,K0,K1, 1, 2, 3);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 15, K1,K2,K0, 2, 3, 1);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 17, K0,K1,K2, 3, 1, 2);

      deinterleave_epi64(X0, X1);
      deinterleave_epi64(X2, X3);

      _mm256_storeu_si256(out_mm++, X0);
      _mm256_storeu_si256(out_mm++, X1);
      _mm256_storeu_si256(out_mm++, X2);
      _mm256_storeu_si256(out_mm++, X3);

      blocks -= 2;
      }

   for(size_t i = 0; i != blocks; ++i)
      {
      __m256i X0 = _mm256_loadu_si256(in_mm++);
      __m256i X1 = _mm256_loadu_si256(in_mm++);

      const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);

      interleave_epi64(X0, X1);

      THREEFISH_INJECT_KEY(X0, X1, 0, K1, K2, 2, 3);

      THREEFISH_ENC_8_ROUNDS(X0, X1, 1, K2,K0,K1, 1, 2, 3);
      THREEFISH_ENC_8_ROUNDS(X0, X1, 3, K1,K2,K0, 2, 3, 1);
      THREEFISH_ENC_8_ROUNDS(X0, X1, 5, K0,K1,K2, 3, 1, 2);
      THREEFISH_ENC_8_ROUNDS(X0, X1, 7, K2,K0,K1, 1, 2, 3);
      THREEFISH_ENC_8_ROUNDS(X0, X1, 9, K1,K2,K0, 2, 3, 1);
      THREEFISH_ENC_8_ROUNDS(X0, X1, 11, K0,K1,K2, 3, 1, 2);
      THREEFISH_ENC_8_ROUNDS(X0, X1, 13, K2,K0,K1, 1, 2, 3);
      THREEFISH_ENC_8_ROUNDS(X0, X1, 15, K1,K2,K0, 2, 3, 1);
      THREEFISH_ENC_8_ROUNDS(X0, X1, 17, K0,K1,K2, 3, 1, 2);

      deinterleave_epi64(X0, X1);

      _mm256_storeu_si256(out_mm++, X0);
      _mm256_storeu_si256(out_mm++, X1);
      }

   _mm256_zeroall();

#undef THREEFISH_ENC_8_ROUNDS
#undef THREEFISH_ROUND
#undef THREEFISH_INJECT_KEY
#undef THREEFISH_ENC_2_8_ROUNDS
#undef THREEFISH_ROUND_2
#undef THREEFISH_INJECT_KEY_2
   }

BOTAN_FUNC_ISA("avx2")
void Threefish_512::avx2_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
   {
   _mm256_zeroupper();

   const uint64_t* K = m_K.data();
   const uint64_t* T_64 = m_T.data();

   const __m256i ROTATE_1 = _mm256_set_epi64x(37,19,36,46);
   const __m256i ROTATE_2 = _mm256_set_epi64x(42,14,27,33);
   const __m256i ROTATE_3 = _mm256_set_epi64x(39,36,49,17);
   const __m256i ROTATE_4 = _mm256_set_epi64x(56,54, 9,44);
   const __m256i ROTATE_5 = _mm256_set_epi64x(24,34,30,39);
   const __m256i ROTATE_6 = _mm256_set_epi64x(17,10,50,13);
   const __m256i ROTATE_7 = _mm256_set_epi64x(43,39,29,25);
   const __m256i ROTATE_8 = _mm256_set_epi64x(22,56,35, 8);

#define THREEFISH_ROUND(X0, X1, SHR) \
   do { \
      const __m256i SHL = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHR); \
      X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(2, 1, 0, 3)); \
      X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0)); \
      X1 = _mm256_xor_si256(X1, X0); \
      X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \
      X0 = _mm256_sub_epi64(X0, X1); \
   } while(0)
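
/*
Descriptive note (added comment): this is the exact inverse of the encryption
round. The vpermq pair undoes the encrypt-side word permutation, then each lane
xors, rotates right by the encrypt rotation amount (SHR here is the encrypt
SHL), and subtracts, reversing xor/rotate-left/add.
*/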

#define THREEFISH_ROUND_2(X0, X1, X2, X3, SHR) \
   do { \
      const __m256i SHL = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHR); \
      X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(2, 1, 0, 3)); \
      X2 = _mm256_permute4x64_epi64(X2, _MM_SHUFFLE(2, 1, 0, 3)); \
      X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0)); \
      X3 = _mm256_permute4x64_epi64(X3, _MM_SHUFFLE(1, 2, 3, 0)); \
      X1 = _mm256_xor_si256(X1, X0); \
      X3 = _mm256_xor_si256(X3, X2); \
      X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \
      X3 = _mm256_or_si256(_mm256_sllv_epi64(X3, SHL), _mm256_srlv_epi64(X3, SHR)); \
      X0 = _mm256_sub_epi64(X0, X1); \
      X2 = _mm256_sub_epi64(X2, X3); \
   } while(0)

#define THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, T0I, T1I) \
   do { \
      const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
      const __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \
      X0 = _mm256_sub_epi64(X0, K0); \
      X1 = _mm256_sub_epi64(X1, K1); \
      X1 = _mm256_sub_epi64(X1, _mm256_set_epi64x(R, 0, 0, 0)); \
      X0 = _mm256_sub_epi64(X0, T0); \
      X1 = _mm256_sub_epi64(X1, T1); \
   } while(0)

#define THREEFISH_DEC_8_ROUNDS(X0, X1, R, K1, K2, K3, T0, T1, T2) \
   do { \
      THREEFISH_INJECT_KEY(X0, X1, R+1, K2, K3, T2, T0); \
      THREEFISH_ROUND(X0, X1, ROTATE_8); \
      THREEFISH_ROUND(X0, X1, ROTATE_7); \
      THREEFISH_ROUND(X0, X1, ROTATE_6); \
      THREEFISH_ROUND(X0, X1, ROTATE_5); \
      \
      THREEFISH_INJECT_KEY(X0, X1, R, K1, K2, T0, T1); \
      THREEFISH_ROUND(X0, X1, ROTATE_4); \
      THREEFISH_ROUND(X0, X1, ROTATE_3); \
      THREEFISH_ROUND(X0, X1, ROTATE_2); \
      THREEFISH_ROUND(X0, X1, ROTATE_1); \
   } while(0)

#define THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, T0I, T1I) \
   do { \
      const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
      __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \
      X0 = _mm256_sub_epi64(X0, K0); \
      X2 = _mm256_sub_epi64(X2, K0); \
      X1 = _mm256_sub_epi64(X1, K1); \
      X3 = _mm256_sub_epi64(X3, K1); \
      T1 = _mm256_add_epi64(T1, _mm256_set_epi64x(R,0,0,0)); \
      X0 = _mm256_sub_epi64(X0, T0); \
      X2 = _mm256_sub_epi64(X2, T0); \
      X1 = _mm256_sub_epi64(X1, T1); \
      X3 = _mm256_sub_epi64(X3, T1); \
   } while(0)

#define THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, R, K1, K2, K3, T0, T1, T2) \
   do { \
      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R+1, K2, K3, T2, T0); \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_8); \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_7); \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_6); \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_5); \
      \
      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K1, K2, T0, T1); \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_4); \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_3); \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_2); \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_1); \
   } while(0)

   /*
   The v1.0 key schedule below keeps all 9 subkey vectors in ymm registers,
   although only 2 or 3 are needed at a time; an alternative is to load
   (0,1,2,3),(4,5,6,7),[8] and then mutate them with vpermq, as the
   encryption path does.
   */
   const __m256i K0 = _mm256_set_epi64x(K[6], K[4], K[2], K[0]);
   const __m256i K1 = _mm256_set_epi64x(K[7], K[5], K[3], K[1]);
   const __m256i K2 = _mm256_set_epi64x(K[8], K[6], K[4], K[2]);
   const __m256i K3 = _mm256_set_epi64x(K[0], K[7], K[5], K[3]);
   const __m256i K4 = _mm256_set_epi64x(K[1], K[8], K[6], K[4]);
   const __m256i K5 = _mm256_set_epi64x(K[2], K[0], K[7], K[5]);
   const __m256i K6 = _mm256_set_epi64x(K[3], K[1], K[8], K[6]);
   const __m256i K7 = _mm256_set_epi64x(K[4], K[2], K[0], K[7]);
   const __m256i K8 = _mm256_set_epi64x(K[5], K[3], K[1], K[8]);

   const __m256i* in_mm = reinterpret_cast<const __m256i*>(in);
   __m256i* out_mm = reinterpret_cast<__m256i*>(out);

   while(blocks >= 2)
      {
      __m256i X0 = _mm256_loadu_si256(in_mm++);
      __m256i X1 = _mm256_loadu_si256(in_mm++);
      __m256i X2 = _mm256_loadu_si256(in_mm++);
      __m256i X3 = _mm256_loadu_si256(in_mm++);

      const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);

      interleave_epi64(X0, X1);
      interleave_epi64(X2, X3);

      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 17, K8,K0,K1, 3, 1, 2);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 15, K6,K7,K8, 2, 3, 1);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 13, K4,K5,K6, 1, 2, 3);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 11, K2,K3,K4, 3, 1, 2);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 9, K0,K1,K2, 2, 3, 1);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 7, K7,K8,K0, 1, 2, 3);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 5, K5,K6,K7, 3, 1, 2);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 3, K3,K4,K5, 2, 3, 1);
      THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 1, K1,K2,K3, 1, 2, 3);

      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, 0, K0, K1, 2, 3);

      deinterleave_epi64(X0, X1);
      deinterleave_epi64(X2, X3);

      _mm256_storeu_si256(out_mm++, X0);
      _mm256_storeu_si256(out_mm++, X1);
      _mm256_storeu_si256(out_mm++, X2);
      _mm256_storeu_si256(out_mm++, X3);

      blocks -= 2;
      }

   for(size_t i = 0; i != blocks; ++i)
      {
      __m256i X0 = _mm256_loadu_si256(in_mm++);
      __m256i X1 = _mm256_loadu_si256(in_mm++);

      const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);

      interleave_epi64(X0, X1);

      THREEFISH_DEC_8_ROUNDS(X0, X1, 17, K8,K0,K1, 3, 1, 2);
      THREEFISH_DEC_8_ROUNDS(X0, X1, 15, K6,K7,K8, 2, 3, 1);
      THREEFISH_DEC_8_ROUNDS(X0, X1, 13, K4,K5,K6, 1, 2, 3);
      THREEFISH_DEC_8_ROUNDS(X0, X1, 11, K2,K3,K4, 3, 1, 2);
      THREEFISH_DEC_8_ROUNDS(X0, X1, 9, K0,K1,K2, 2, 3, 1);
      THREEFISH_DEC_8_ROUNDS(X0, X1, 7, K7,K8,K0, 1, 2, 3);
      THREEFISH_DEC_8_ROUNDS(X0, X1, 5, K5,K6,K7, 3, 1, 2);
      THREEFISH_DEC_8_ROUNDS(X0, X1, 3, K3,K4,K5, 2, 3, 1);
      THREEFISH_DEC_8_ROUNDS(X0, X1, 1, K1,K2,K3, 1, 2, 3);

      THREEFISH_INJECT_KEY(X0, X1, 0, K0, K1, 2, 3);

      deinterleave_epi64(X0, X1);

      _mm256_storeu_si256(out_mm++, X0);
      _mm256_storeu_si256(out_mm++, X1);
      }

#undef THREEFISH_DEC_8_ROUNDS
#undef THREEFISH_ROUND
#undef THREEFISH_INJECT_KEY
#undef THREEFISH_DEC_2_8_ROUNDS
#undef THREEFISH_ROUND_2
#undef THREEFISH_INJECT_KEY_2

   _mm256_zeroall();
   }

}
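
For context, a minimal usage sketch (a separate example program, not part of this file; it assumes a Botan 2.x installation, and the library selects this AVX2 path at runtime when the CPU supports it): encrypting and decrypting one 64-byte block with the Threefish_512 class declared in <botan/threefish_512.h>.

#include <botan/threefish_512.h>
#include <cassert>
#include <cstdint>
#include <vector>

int main()
   {
   Botan::Threefish_512 cipher;

   const std::vector<uint8_t> key(64, 0x42);   // Threefish-512 uses a 512-bit key
   cipher.set_key(key.data(), key.size());

   std::vector<uint8_t> block(64, 0x01);       // one 512-bit block
   const std::vector<uint8_t> original = block;

   cipher.encrypt(block.data());               // in-place single-block encrypt
   cipher.decrypt(block.data());               // and decrypt back

   assert(block == original);
   return 0;
   }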