#include <botan/sha160.h>
#include <emmintrin.h>
namespace Botan {

namespace SHA1_SSE2_F {
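/*
* The first 16 words of the schedule are the message block itself,
* which SHA-1 treats as big-endian; on a little-endian machine each
* 32-bit word must be byte swapped. "Preparing" a word then just means
* adding in the round constant K.
*/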
#define prep00_15(P, W)                                      \
   do {                                                      \
      /* swap the 16-bit halves of each 32-bit word... */    \
      W = _mm_shufflehi_epi16(W, _MM_SHUFFLE(2, 3, 0, 1));   \
      W = _mm_shufflelo_epi16(W, _MM_SHUFFLE(2, 3, 0, 1));   \
      /* ...then swap the bytes within each 16-bit half */   \
      W = _mm_or_si128(_mm_slli_epi16(W, 8),                 \
                       _mm_srli_epi16(W, 8));                \
      P.u128 = _mm_add_epi32(W, K00_19);                     \
   } while(0)
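/*
* For each multiple of 4, t, we want to calculate this:
*
* W[t+0] = rol(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1);
* W[t+1] = rol(W[t-2] ^ W[t-7] ^ W[t-13] ^ W[t-15], 1);
* W[t+2] = rol(W[t-1] ^ W[t-6] ^ W[t-12] ^ W[t-14], 1);
* W[t+3] = rol(W[t]   ^ W[t-5] ^ W[t-11] ^ W[t-13], 1);
*
* W[t+3] depends on W[t+0], which is produced in the same step, so it
* cannot be computed by the same vector XOR. Instead a zero is shifted
* into its place and the missing contribution is patched in afterwards:
*
* W[t+3] = rol(  0    ^ W[t-5] ^ W[t-11] ^ W[t-13], 1);
* W[t+3] ^= rol(W[t+0], 1);
*/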
#define prep(prep, XW0, XW1, XW2, XW3, K)                               \
   do {                                                                 \
      __m128i r0, r1, r2, r3;                                           \
                                                                        \
      /* load W[t-4], shifting a zero into the W[t+0] slot */           \
      r3 = _mm_srli_si128((XW3), 4);                                    \
      r0 = (XW0);                                                       \
      /* get high 64 bits of XW0 into low 64 bits */                    \
      r1 = _mm_shuffle_epi32((XW0), _MM_SHUFFLE(1, 0, 3, 2));           \
      /* load high 64 bits of r1 */                                     \
      r1 = _mm_unpacklo_epi64(r1, (XW1));                               \
      r2 = (XW2);                                                       \
                                                                        \
      r0 = _mm_xor_si128(r1, r0);                                       \
      r2 = _mm_xor_si128(r3, r2);                                       \
      r0 = _mm_xor_si128(r2, r0);                                       \
      /* unrotated W[t]..W[t+2] in r0 ... still need W[t+3] */          \
                                                                        \
      r2 = _mm_slli_si128(r0, 12);                                      \
      r1 = _mm_cmplt_epi32(r0, _mm_setzero_si128());                    \
      r0 = _mm_add_epi32(r0, r0); /* shift left by 1 */                 \
      r0 = _mm_sub_epi32(r0, r1); /* carry the sign bit in: rol 1 */    \
                                                                        \
      /* patch lane 3 with rol(W[t+0], 1), via rol 2 of the             \
         unrotated value held in r2 */                                  \
      r3 = _mm_srli_epi32(r2, 30);                                      \
      r2 = _mm_slli_epi32(r2, 2);                                       \
                                                                        \
      r0 = _mm_xor_si128(r0, r3);                                       \
      r0 = _mm_xor_si128(r0, r2); /* r0 now has W[t+3] */               \
                                                                        \
      (XW0) = r0;                                                       \
      (prep).u128 = _mm_add_epi32(r0, (K));                             \
   } while(0)
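/*
* The SHA-1 round functions: F1 is the "choose" function of rounds
* 0-19, F2 the parity function of rounds 20-39, F3 the "majority"
* function of rounds 40-59, and F4 parity again for rounds 60-79.
* Each round updates E and rotates B in place.
*/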
inline void F1(uint32_t A, uint32_t& B, uint32_t C, uint32_t D, uint32_t& E, uint32_t msg)
   {
   E += (D ^ (B & (C ^ D))) + msg + rotate_left(A, 5);
   B  = rotate_left(B, 30);
   }
inline void F2(uint32_t A, uint32_t& B, uint32_t C, uint32_t D, uint32_t& E, uint32_t msg)
   {
   E += (B ^ C ^ D) + msg + rotate_left(A, 5);
   B  = rotate_left(B, 30);
   }
inline void F3(uint32_t A, uint32_t& B, uint32_t C, uint32_t D, uint32_t& E, uint32_t msg)
   {
   E += ((B & C) | ((B | C) & D)) + msg + rotate_left(A, 5);
   B  = rotate_left(B, 30);
   }
inline void F4(uint32_t A, uint32_t& B, uint32_t C, uint32_t D, uint32_t& E, uint32_t msg)
   {
   E += (B ^ C ^ D) + msg + rotate_left(A, 5);
   B  = rotate_left(B, 30);
   }

}
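/*
* SHA-160 compression function using SSE2 for the message expansion.
* Only the schedule is vectorized: each of the 80 rounds depends on
* the result of the previous one, so the rounds themselves stay scalar.
*/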
BOTAN_FUNC_ISA("sse2")
void SHA_160::sse2_compress_n(secure_vector<uint32_t>& digest,
                              const uint8_t input[],
                              size_t blocks)
   {
   using namespace SHA1_SSE2_F;
   /* the four SHA-1 round constants, broadcast to all four lanes */
   const __m128i K00_19 = _mm_set1_epi32(0x5A827999);
   const __m128i K20_39 = _mm_set1_epi32(0x6ED9EBA1);
   const __m128i K40_59 = _mm_set1_epi32(0x8F1BBCDC);
   const __m128i K60_79 = _mm_set1_epi32(0xCA62C1D6);
   uint32_t A = digest[0],
            B = digest[1],
            C = digest[2],
            D = digest[3],
            E = digest[4];
   const __m128i* input_mm = reinterpret_cast<const __m128i*>(input);
   for(size_t i = 0; i != blocks; ++i)
      {
      /* P blocks readable both as a SIMD register and as 32-bit words */
      union v4si { uint32_t u32[4]; __m128i u128; };
      v4si P0, P1, P2, P3;

      __m128i W0 = _mm_loadu_si128(&input_mm[0]);
      prep00_15(P0, W0);
      __m128i W1 = _mm_loadu_si128(&input_mm[1]);
      prep00_15(P1, W1);
      __m128i W2 = _mm_loadu_si128(&input_mm[2]);
      prep00_15(P2, W2);
      __m128i W3 = _mm_loadu_si128(&input_mm[3]);
      prep00_15(P3, W3);
#define GET_P_32(P, i) P.u32[i]
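      /*
      * Four scalar rounds consume each P block just before prep()
      * refills it, rotating the roles of A..E each round; rounds 0-3
      * (F1, consuming P0) look like this:
      */
      F1(A, B, C, D, E, GET_P_32(P0, 0));
      F1(E, A, B, C, D, GET_P_32(P0, 1));
      F1(D, E, A, B, C, GET_P_32(P0, 2));
      F1(C, D, E, A, B, GET_P_32(P0, 3));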
      prep(P0, W0, W1, W2, W3, K00_19);   /* expand W[16..19] */

      prep(P1, W1, W2, W3, W0, K20_39);   /* W[20..23] */
      prep(P2, W2, W3, W0, W1, K20_39);   /* W[24..27] */
      prep(P3, W3, W0, W1, W2, K20_39);   /* W[28..31] */
      prep(P0, W0, W1, W2, W3, K20_39);   /* W[32..35] */
      prep(P1, W1, W2, W3, W0, K20_39);   /* W[36..39] */

      prep(P2, W2, W3, W0, W1, K40_59);   /* W[40..43] */
      prep(P3, W3, W0, W1, W2, K40_59);   /* W[44..47] */
      prep(P0, W0, W1, W2, W3, K40_59);   /* W[48..51] */
      prep(P1, W1, W2, W3, W0, K40_59);   /* W[52..55] */
      prep(P2, W2, W3, W0, W1, K40_59);   /* W[56..59] */

      prep(P3, W3, W0, W1, W2, K60_79);   /* W[60..63] */
      prep(P0, W0, W1, W2, W3, K60_79);   /* W[64..67] */
      prep(P1, W1, W2, W3, W0, K60_79);   /* W[68..71] */
      prep(P2, W2, W3, W0, W1, K60_79);   /* W[72..75] */
      prep(P3, W3, W0, W1, W2, K60_79);   /* W[76..79] */
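      /*
      * After rounds 64-79 (F4) consume the final P0..P3, add this
      * block's working values back into the chaining state.
      */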
      A = (digest[0] += A);
      B = (digest[1] += B);
      C = (digest[2] += C);
      D = (digest[3] += D);
      E = (digest[4] += E);
      /* advance to the next 64-byte block (four 16-byte loads) */
      input_mm += (64 / 16);
      }
   }

}