#include <botan/chacha.h>
#include <emmintrin.h>

namespace Botan {

//static
BOTAN_FUNC_ISA("sse2")
void ChaCha::chacha_sse2_x4(uint8_t output[64*4], uint32_t input[16], size_t rounds)
   {
   BOTAN_ASSERT(rounds % 2 == 0, "Valid rounds");

   const __m128i* input_mm = reinterpret_cast<const __m128i*>(input);
   __m128i* output_mm = reinterpret_cast<__m128i*>(output);

   __m128i input0 = _mm_loadu_si128(input_mm);
   __m128i input1 = _mm_loadu_si128(input_mm + 1);
   __m128i input2 = _mm_loadu_si128(input_mm + 2);
   __m128i input3 = _mm_loadu_si128(input_mm + 3);
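
   // SSE2 has no rotate instruction, so rotate each 32-bit lane left by n
   // by combining a left and a right shift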
#define mm_rotl(r, n) \
   _mm_or_si128(_mm_slli_epi32(r, n), _mm_srli_epi32(r, 32-n))
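
   // Four independent ChaCha block states are processed in parallel; blocks
   // 1-3 copy the same key/nonce rows but advance the 64-bit block counter
   // (the low lanes of the fourth row) by 1, 2 and 3 respectively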
   __m128i r0_0 = input0;
   __m128i r0_1 = input1;
   __m128i r0_2 = input2;
   __m128i r0_3 = input3;

   __m128i r1_0 = input0;
   __m128i r1_1 = input1;
   __m128i r1_2 = input2;
   __m128i r1_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 1));

   __m128i r2_0 = input0;
   __m128i r2_1 = input1;
   __m128i r2_2 = input2;
   __m128i r2_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 2));

   __m128i r3_0 = input0;
   __m128i r3_1 = input1;
   __m128i r3_2 = input2;
   __m128i r3_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 3));
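
   // Each loop iteration performs two ChaCha rounds: quarter rounds over the
   // columns, then (after diagonalizing the state) over the diagonals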
   for(size_t r = 0; r != rounds / 2; ++r)
      {
      r0_0 = _mm_add_epi32(r0_0, r0_1);
      r1_0 = _mm_add_epi32(r1_0, r1_1);
      r2_0 = _mm_add_epi32(r2_0, r2_1);
      r3_0 = _mm_add_epi32(r3_0, r3_1);

      r0_3 = _mm_xor_si128(r0_3, r0_0);
      r1_3 = _mm_xor_si128(r1_3, r1_0);
      r2_3 = _mm_xor_si128(r2_3, r2_0);
      r3_3 = _mm_xor_si128(r3_3, r3_0);

      r0_3 = mm_rotl(r0_3, 16);
      r1_3 = mm_rotl(r1_3, 16);
      r2_3 = mm_rotl(r2_3, 16);
      r3_3 = mm_rotl(r3_3, 16);

      r0_2 = _mm_add_epi32(r0_2, r0_3);
      r1_2 = _mm_add_epi32(r1_2, r1_3);
      r2_2 = _mm_add_epi32(r2_2, r2_3);
      r3_2 = _mm_add_epi32(r3_2, r3_3);

      r0_1 = _mm_xor_si128(r0_1, r0_2);
      r1_1 = _mm_xor_si128(r1_1, r1_2);
      r2_1 = _mm_xor_si128(r2_1, r2_2);
      r3_1 = _mm_xor_si128(r3_1, r3_2);

      r0_1 = mm_rotl(r0_1, 12);
      r1_1 = mm_rotl(r1_1, 12);
      r2_1 = mm_rotl(r2_1, 12);
      r3_1 = mm_rotl(r3_1, 12);

      r0_0 = _mm_add_epi32(r0_0, r0_1);
      r1_0 = _mm_add_epi32(r1_0, r1_1);
      r2_0 = _mm_add_epi32(r2_0, r2_1);
      r3_0 = _mm_add_epi32(r3_0, r3_1);

      r0_3 = _mm_xor_si128(r0_3, r0_0);
      r1_3 = _mm_xor_si128(r1_3, r1_0);
      r2_3 = _mm_xor_si128(r2_3, r2_0);
      r3_3 = _mm_xor_si128(r3_3, r3_0);

      r0_3 = mm_rotl(r0_3, 8);
      r1_3 = mm_rotl(r1_3, 8);
      r2_3 = mm_rotl(r2_3, 8);
      r3_3 = mm_rotl(r3_3, 8);

      r0_2 = _mm_add_epi32(r0_2, r0_3);
      r1_2 = _mm_add_epi32(r1_2, r1_3);
      r2_2 = _mm_add_epi32(r2_2, r2_3);
      r3_2 = _mm_add_epi32(r3_2, r3_3);

      r0_1 = _mm_xor_si128(r0_1, r0_2);
      r1_1 = _mm_xor_si128(r1_1, r1_2);
      r2_1 = _mm_xor_si128(r2_1, r2_2);
      r3_1 = _mm_xor_si128(r3_1, r3_2);

      r0_1 = mm_rotl(r0_1, 7);
      r1_1 = mm_rotl(r1_1, 7);
      r2_1 = mm_rotl(r2_1, 7);
      r3_1 = mm_rotl(r3_1, 7);
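
      // Rotate rows 1-3 so the next four quarter rounds act on the diagonals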
      r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(0, 3, 2, 1));
      r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2));
      r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(2, 1, 0, 3));

      r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(0, 3, 2, 1));
      r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2));
      r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(2, 1, 0, 3));

      r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(0, 3, 2, 1));
      r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2));
      r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(2, 1, 0, 3));

      r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(0, 3, 2, 1));
      r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2));
      r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(2, 1, 0, 3));
      r0_0 = _mm_add_epi32(r0_0, r0_1);
      r1_0 = _mm_add_epi32(r1_0, r1_1);
      r2_0 = _mm_add_epi32(r2_0, r2_1);
      r3_0 = _mm_add_epi32(r3_0, r3_1);

      r0_3 = _mm_xor_si128(r0_3, r0_0);
      r1_3 = _mm_xor_si128(r1_3, r1_0);
      r2_3 = _mm_xor_si128(r2_3, r2_0);
      r3_3 = _mm_xor_si128(r3_3, r3_0);

      r0_3 = mm_rotl(r0_3, 16);
      r1_3 = mm_rotl(r1_3, 16);
      r2_3 = mm_rotl(r2_3, 16);
      r3_3 = mm_rotl(r3_3, 16);

      r0_2 = _mm_add_epi32(r0_2, r0_3);
      r1_2 = _mm_add_epi32(r1_2, r1_3);
      r2_2 = _mm_add_epi32(r2_2, r2_3);
      r3_2 = _mm_add_epi32(r3_2, r3_3);

      r0_1 = _mm_xor_si128(r0_1, r0_2);
      r1_1 = _mm_xor_si128(r1_1, r1_2);
      r2_1 = _mm_xor_si128(r2_1, r2_2);
      r3_1 = _mm_xor_si128(r3_1, r3_2);

      r0_1 = mm_rotl(r0_1, 12);
      r1_1 = mm_rotl(r1_1, 12);
      r2_1 = mm_rotl(r2_1, 12);
      r3_1 = mm_rotl(r3_1, 12);

      r0_0 = _mm_add_epi32(r0_0, r0_1);
      r1_0 = _mm_add_epi32(r1_0, r1_1);
      r2_0 = _mm_add_epi32(r2_0, r2_1);
      r3_0 = _mm_add_epi32(r3_0, r3_1);

      r0_3 = _mm_xor_si128(r0_3, r0_0);
      r1_3 = _mm_xor_si128(r1_3, r1_0);
      r2_3 = _mm_xor_si128(r2_3, r2_0);
      r3_3 = _mm_xor_si128(r3_3, r3_0);

      r0_3 = mm_rotl(r0_3, 8);
      r1_3 = mm_rotl(r1_3, 8);
      r2_3 = mm_rotl(r2_3, 8);
      r3_3 = mm_rotl(r3_3, 8);

      r0_2 = _mm_add_epi32(r0_2, r0_3);
      r1_2 = _mm_add_epi32(r1_2, r1_3);
      r2_2 = _mm_add_epi32(r2_2, r2_3);
      r3_2 = _mm_add_epi32(r3_2, r3_3);

      r0_1 = _mm_xor_si128(r0_1, r0_2);
      r1_1 = _mm_xor_si128(r1_1, r1_2);
      r2_1 = _mm_xor_si128(r2_1, r2_2);
      r3_1 = _mm_xor_si128(r3_1, r3_2);

      r0_1 = mm_rotl(r0_1, 7);
      r1_1 = mm_rotl(r1_1, 7);
      r2_1 = mm_rotl(r2_1, 7);
      r3_1 = mm_rotl(r3_1, 7);
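
      // Undo the diagonalization so the next iteration starts from column order again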
      r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(2, 1, 0, 3));
      r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2));
      r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(0, 3, 2, 1));

      r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(2, 1, 0, 3));
      r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2));
      r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(0, 3, 2, 1));

      r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(2, 1, 0, 3));
      r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2));
      r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(0, 3, 2, 1));

      r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(2, 1, 0, 3));
      r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2));
      r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(0, 3, 2, 1));
      }
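
   // Add the original input state (the ChaCha feed-forward) to form the keystream;
   // blocks 1-3 re-apply their counter offsets since input3 holds block 0's counter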
   r0_0 = _mm_add_epi32(r0_0, input0);
   r0_1 = _mm_add_epi32(r0_1, input1);
   r0_2 = _mm_add_epi32(r0_2, input2);
   r0_3 = _mm_add_epi32(r0_3, input3);

   r1_0 = _mm_add_epi32(r1_0, input0);
   r1_1 = _mm_add_epi32(r1_1, input1);
   r1_2 = _mm_add_epi32(r1_2, input2);
   r1_3 = _mm_add_epi32(r1_3, input3);
   r1_3 = _mm_add_epi64(r1_3, _mm_set_epi32(0, 0, 0, 1));

   r2_0 = _mm_add_epi32(r2_0, input0);
   r2_1 = _mm_add_epi32(r2_1, input1);
   r2_2 = _mm_add_epi32(r2_2, input2);
   r2_3 = _mm_add_epi32(r2_3, input3);
   r2_3 = _mm_add_epi64(r2_3, _mm_set_epi32(0, 0, 0, 2));

   r3_0 = _mm_add_epi32(r3_0, input0);
   r3_1 = _mm_add_epi32(r3_1, input1);
   r3_2 = _mm_add_epi32(r3_2, input2);
   r3_3 = _mm_add_epi32(r3_3, input3);
   r3_3 = _mm_add_epi64(r3_3, _mm_set_epi32(0, 0, 0, 3));
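
   // Write the four 64-byte keystream blocks (256 bytes total) to the output buffer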
   _mm_storeu_si128(output_mm + 0, r0_0);
   _mm_storeu_si128(output_mm + 1, r0_1);
   _mm_storeu_si128(output_mm + 2, r0_2);
   _mm_storeu_si128(output_mm + 3, r0_3);

   _mm_storeu_si128(output_mm + 4, r1_0);
   _mm_storeu_si128(output_mm + 5, r1_1);
   _mm_storeu_si128(output_mm + 6, r1_2);
   _mm_storeu_si128(output_mm + 7, r1_3);

   _mm_storeu_si128(output_mm + 8, r2_0);
   _mm_storeu_si128(output_mm + 9, r2_1);
   _mm_storeu_si128(output_mm + 10, r2_2);
   _mm_storeu_si128(output_mm + 11, r2_3);

   _mm_storeu_si128(output_mm + 12, r3_0);
   _mm_storeu_si128(output_mm + 13, r3_1);
   _mm_storeu_si128(output_mm + 14, r3_2);
   _mm_storeu_si128(output_mm + 15, r3_3);
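
   // Note: this kernel only reads `input`; advancing the 64-bit block counter
   // in input[12..13] by 4 is left to the caller when more keystream is needed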

#undef mm_rotl
   }

}