8 #ifndef BOTAN_SIMD_32_H__
9 #define BOTAN_SIMD_32_H__
11 #include <botan/types.h>
12 #include <botan/loadstor.h>
13 #include <botan/bswap.h>
14 #include <botan/cpuid.h>
16 #if defined(BOTAN_TARGET_SUPPORTS_SSE2)
17 #include <emmintrin.h>
18 #define BOTAN_SIMD_USE_SSE2
20 #elif defined(BOTAN_TARGET_SUPPORTS_ALTIVEC)
24 #define BOTAN_SIMD_USE_ALTIVEC
26 #elif defined(BOTAN_TARGET_SUPPORTS_NEON)
28 #define BOTAN_SIMD_USE_NEON
50 #if !defined(BOTAN_BUILD_COMPILER_IS_MSVC_2013)
60 #if defined(BOTAN_SIMD_USE_SSE2)
61 ::memset(&m_sse, 0,
sizeof(m_sse));
62 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
63 m_vmx = vec_splat_u32(0);
64 #elif defined(BOTAN_SIMD_USE_NEON)
65 m_neon = vdupq_n_u32(0);
67 ::memset(m_scalar, 0,
sizeof(m_scalar));
76 #if defined(BOTAN_SIMD_USE_SSE2)
77 m_sse = _mm_loadu_si128(reinterpret_cast<const __m128i*>(B));
78 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
79 m_vmx = (__vector
unsigned int){B[0], B[1], B[2], B[3]};
80 #elif defined(BOTAN_SIMD_USE_NEON)
81 m_neon = vld1q_u32(B);
93 SIMD_4x32(uint32_t B0, uint32_t B1, uint32_t B2, uint32_t B3)
95 #if defined(BOTAN_SIMD_USE_SSE2)
96 m_sse = _mm_set_epi32(B3, B2, B1, B0);
97 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
98 m_vmx = (__vector
unsigned int){B0, B1, B2, B3};
99 #elif defined(BOTAN_SIMD_USE_NEON)
101 const uint32_t B[4] = { B0, B1, B2, B3 };
102 m_neon = vld1q_u32(B);
// Backend switch: SSE2 first, then NEON. The NEON branch must test
// BOTAN_SIMD_USE_NEON - no visible part of this header defines
// BOTAN_SIMD_USE_ARM, and every other backend switch here tests
// BOTAN_SIMD_USE_NEON, so the old spelling left this branch unreachable.
116 #if defined(BOTAN_SIMD_USE_SSE2)
118 #elif defined(BOTAN_SIMD_USE_NEON)
130 #if defined(BOTAN_SIMD_USE_SSE2)
131 return SIMD_4x32(_mm_loadu_si128(reinterpret_cast<const __m128i*>(in)));
132 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
133 const uint32_t* in_32 =
static_cast<const uint32_t*
>(in);
135 __vector
unsigned int R0 = vec_ld(0, in_32);
136 __vector
unsigned int R1 = vec_ld(12, in_32);
138 __vector
unsigned char perm = vec_lvsl(0, in_32);
142 perm = vec_xor(perm, vec_splat_u8(3));
145 R0 = vec_perm(R0, R1, perm);
148 #elif defined(BOTAN_SIMD_USE_NEON)
151 std::memcpy(in32, in, 16);
170 #if defined(BOTAN_SIMD_USE_SSE2)
174 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
176 const uint32_t* in_32 =
static_cast<const uint32_t*
>(in);
177 __vector
unsigned int R0 = vec_ld(0, in_32);
178 __vector
unsigned int R1 = vec_ld(12, in_32);
179 __vector
unsigned char perm = vec_lvsl(0, in_32);
183 perm = vec_xor(perm, vec_splat_u8(3));
186 R0 = vec_perm(R0, R1, perm);
189 #elif defined(BOTAN_SIMD_USE_NEON)
192 std::memcpy(in32, in, 16);
211 #if defined(BOTAN_SIMD_USE_SSE2)
213 _mm_storeu_si128(reinterpret_cast<__m128i*>(out), m_sse);
215 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
217 __vector
unsigned char perm = vec_lvsl(0, static_cast<uint32_t*>(
nullptr));
220 perm = vec_xor(perm, vec_splat_u8(3));
224 __vector
unsigned int V;
227 vec.V = vec_perm(m_vmx, m_vmx, perm);
230 #elif defined(BOTAN_SIMD_USE_NEON)
239 uint32_t out32[4] = { 0 };
240 vst1q_u32(out32, m_neon);
244 Botan::store_le(out, m_scalar[0], m_scalar[1], m_scalar[2], m_scalar[3]);
253 #if defined(BOTAN_SIMD_USE_SSE2)
257 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
260 __vector
unsigned int V;
266 #elif defined(BOTAN_SIMD_USE_NEON)
275 uint32_t out32[4] = { 0 };
276 vst1q_u32(out32, m_neon);
281 Botan::store_be(out, m_scalar[0], m_scalar[1], m_scalar[2], m_scalar[3]);
290 #if defined(BOTAN_SIMD_USE_SSE2)
292 m_sse = _mm_or_si128(_mm_slli_epi32(m_sse, static_cast<int>(rot)),
293 _mm_srli_epi32(m_sse, static_cast<int>(32-rot)));
295 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
297 const unsigned int r =
static_cast<unsigned int>(rot);
298 m_vmx = vec_rl(m_vmx, (__vector
unsigned int){r, r, r, r});
300 #elif defined(BOTAN_SIMD_USE_NEON)
301 m_neon = vorrq_u32(vshlq_n_u32(m_neon, static_cast<int>(rot)),
302 vshrq_n_u32(m_neon, static_cast<int>(32-rot)));
372 #if defined(BOTAN_SIMD_USE_SSE2)
373 m_sse = _mm_add_epi32(m_sse, other.m_sse);
374 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
375 m_vmx = vec_add(m_vmx, other.m_vmx);
376 #elif defined(BOTAN_SIMD_USE_NEON)
377 m_neon = vaddq_u32(m_neon, other.m_neon);
379 m_scalar[0] += other.m_scalar[0];
380 m_scalar[1] += other.m_scalar[1];
381 m_scalar[2] += other.m_scalar[2];
382 m_scalar[3] += other.m_scalar[3];
388 #if defined(BOTAN_SIMD_USE_SSE2)
389 m_sse = _mm_sub_epi32(m_sse, other.m_sse);
390 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
391 m_vmx = vec_sub(m_vmx, other.m_vmx);
392 #elif defined(BOTAN_SIMD_USE_NEON)
393 m_neon = vsubq_u32(m_neon, other.m_neon);
395 m_scalar[0] -= other.m_scalar[0];
396 m_scalar[1] -= other.m_scalar[1];
397 m_scalar[2] -= other.m_scalar[2];
398 m_scalar[3] -= other.m_scalar[3];
404 #if defined(BOTAN_SIMD_USE_SSE2)
405 m_sse = _mm_xor_si128(m_sse, other.m_sse);
407 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
408 m_vmx = vec_xor(m_vmx, other.m_vmx);
409 #elif defined(BOTAN_SIMD_USE_NEON)
410 m_neon = veorq_u32(m_neon, other.m_neon);
412 m_scalar[0] ^= other.m_scalar[0];
413 m_scalar[1] ^= other.m_scalar[1];
414 m_scalar[2] ^= other.m_scalar[2];
415 m_scalar[3] ^= other.m_scalar[3];
421 #if defined(BOTAN_SIMD_USE_SSE2)
422 m_sse = _mm_or_si128(m_sse, other.m_sse);
423 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
424 m_vmx = vec_or(m_vmx, other.m_vmx);
425 #elif defined(BOTAN_SIMD_USE_NEON)
426 m_neon = vorrq_u32(m_neon, other.m_neon);
428 m_scalar[0] |= other.m_scalar[0];
429 m_scalar[1] |= other.m_scalar[1];
430 m_scalar[2] |= other.m_scalar[2];
431 m_scalar[3] |= other.m_scalar[3];
437 #if defined(BOTAN_SIMD_USE_SSE2)
438 m_sse = _mm_and_si128(m_sse, other.m_sse);
439 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
440 m_vmx = vec_and(m_vmx, other.m_vmx);
441 #elif defined(BOTAN_SIMD_USE_NEON)
442 m_neon = vandq_u32(m_neon, other.m_neon);
444 m_scalar[0] &= other.m_scalar[0];
445 m_scalar[1] &= other.m_scalar[1];
446 m_scalar[2] &= other.m_scalar[2];
447 m_scalar[3] &= other.m_scalar[3];
453 #if defined(BOTAN_SIMD_USE_SSE2)
454 return SIMD_4x32(_mm_slli_epi32(m_sse, static_cast<int>(shift)));
456 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
457 const unsigned int s =
static_cast<unsigned int>(shift);
458 return SIMD_4x32(vec_sl(m_vmx, (__vector
unsigned int){s, s, s, s}));
459 #elif defined(BOTAN_SIMD_USE_NEON)
460 return SIMD_4x32(vshlq_n_u32(m_neon, static_cast<int>(shift)));
463 m_scalar[1] << shift,
464 m_scalar[2] << shift,
465 m_scalar[3] << shift);
471 #if defined(BOTAN_SIMD_USE_SSE2)
472 return SIMD_4x32(_mm_srli_epi32(m_sse, static_cast<int>(shift)));
474 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
475 const unsigned int s =
static_cast<unsigned int>(shift);
476 return SIMD_4x32(vec_sr(m_vmx, (__vector
unsigned int){s, s, s, s}));
477 #elif defined(BOTAN_SIMD_USE_NEON)
478 return SIMD_4x32(vshrq_n_u32(m_neon, static_cast<int>(shift)));
480 return SIMD_4x32(m_scalar[0] >> shift, m_scalar[1] >> shift,
481 m_scalar[2] >> shift, m_scalar[3] >> shift);
488 #if defined(BOTAN_SIMD_USE_SSE2)
489 return SIMD_4x32(_mm_xor_si128(m_sse, _mm_set1_epi32(0xFFFFFFFF)));
490 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
492 #elif defined(BOTAN_SIMD_USE_NEON)
495 return SIMD_4x32(~m_scalar[0], ~m_scalar[1], ~m_scalar[2], ~m_scalar[3]);
502 #if defined(BOTAN_SIMD_USE_SSE2)
503 return SIMD_4x32(_mm_andnot_si128(m_sse, other.m_sse));
504 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
509 return SIMD_4x32(vec_andc(other.m_vmx, m_vmx));
510 #elif defined(BOTAN_SIMD_USE_NEON)
512 return SIMD_4x32(vbicq_u32(other.m_neon, m_neon));
514 return SIMD_4x32((~m_scalar[0]) & other.m_scalar[0],
515 (~m_scalar[1]) & other.m_scalar[1],
516 (~m_scalar[2]) & other.m_scalar[2],
517 (~m_scalar[3]) & other.m_scalar[3]);
526 #if defined(BOTAN_SIMD_USE_SSE2)
529 T = _mm_shufflehi_epi16(T, _MM_SHUFFLE(2, 3, 0, 1));
530 T = _mm_shufflelo_epi16(T, _MM_SHUFFLE(2, 3, 0, 1));
531 return SIMD_4x32(_mm_or_si128(_mm_srli_epi16(T, 8), _mm_slli_epi16(T, 8)));
533 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
535 __vector
unsigned char perm = vec_lvsl(0, static_cast<uint32_t*>(
nullptr));
536 perm = vec_xor(perm, vec_splat_u8(3));
537 return SIMD_4x32(vec_perm(m_vmx, m_vmx, perm));
539 #elif defined(BOTAN_SIMD_USE_NEON)
551 return (ror8 & mask1) | (rol8 & mask2);
567 #if defined(BOTAN_SIMD_USE_SSE2)
568 const __m128i T0 = _mm_unpacklo_epi32(B0.m_sse, B1.m_sse);
569 const __m128i T1 = _mm_unpacklo_epi32(B2.m_sse, B3.m_sse);
570 const __m128i T2 = _mm_unpackhi_epi32(B0.m_sse, B1.m_sse);
571 const __m128i T3 = _mm_unpackhi_epi32(B2.m_sse, B3.m_sse);
573 B0.m_sse = _mm_unpacklo_epi64(T0, T1);
574 B1.m_sse = _mm_unpackhi_epi64(T0, T1);
575 B2.m_sse = _mm_unpacklo_epi64(T2, T3);
576 B3.m_sse = _mm_unpackhi_epi64(T2, T3);
577 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
578 const __vector
unsigned int T0 = vec_mergeh(B0.m_vmx, B2.m_vmx);
579 const __vector
unsigned int T1 = vec_mergeh(B1.m_vmx, B3.m_vmx);
580 const __vector
unsigned int T2 = vec_mergel(B0.m_vmx, B2.m_vmx);
581 const __vector
unsigned int T3 = vec_mergel(B1.m_vmx, B3.m_vmx);
583 B0.m_vmx = vec_mergeh(T0, T1);
584 B1.m_vmx = vec_mergel(T0, T1);
585 B2.m_vmx = vec_mergeh(T2, T3);
586 B3.m_vmx = vec_mergel(T2, T3);
587 #elif defined(BOTAN_SIMD_USE_NEON)
589 #if defined(BOTAN_TARGET_ARCH_IS_ARM32)
591 const uint32x4x2_t T0 = vzipq_u32(B0.m_neon, B2.m_neon);
592 const uint32x4x2_t T1 = vzipq_u32(B1.m_neon, B3.m_neon);
593 const uint32x4x2_t O0 = vzipq_u32(T0.val[0], T1.val[0]);
594 const uint32x4x2_t O1 = vzipq_u32(T0.val[1], T1.val[1]);
596 B0.m_neon = O0.val[0];
597 B1.m_neon = O0.val[1];
598 B2.m_neon = O1.val[0];
599 B3.m_neon = O1.val[1];
601 #elif defined(BOTAN_TARGET_ARCH_IS_ARM64)
602 const uint32x4_t T0 = vzip1q_u32(B0.m_neon, B2.m_neon);
603 const uint32x4_t T2 = vzip2q_u32(B0.m_neon, B2.m_neon);
605 const uint32x4_t T1 = vzip1q_u32(B1.m_neon, B3.m_neon);
606 const uint32x4_t T3 = vzip2q_u32(B1.m_neon, B3.m_neon);
608 B0.m_neon = vzip1q_u32(T0, T1);
609 B1.m_neon = vzip2q_u32(T0, T1);
611 B2.m_neon = vzip1q_u32(T2, T3);
612 B3.m_neon = vzip2q_u32(T2, T3);
617 SIMD_4x32 T0(B0.m_scalar[0], B1.m_scalar[0], B2.m_scalar[0], B3.m_scalar[0]);
618 SIMD_4x32 T1(B0.m_scalar[1], B1.m_scalar[1], B2.m_scalar[1], B3.m_scalar[1]);
619 SIMD_4x32 T2(B0.m_scalar[2], B1.m_scalar[2], B2.m_scalar[2], B3.m_scalar[2]);
620 SIMD_4x32 T3(B0.m_scalar[3], B1.m_scalar[3], B2.m_scalar[3], B3.m_scalar[3]);
631 #if defined(BOTAN_SIMD_USE_SSE2)
632 explicit SIMD_4x32(__m128i in) : m_sse(in) {}
633 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
634 explicit SIMD_4x32(__vector
unsigned int in) : m_vmx(in) {}
635 #elif defined(BOTAN_SIMD_USE_NEON)
636 explicit SIMD_4x32(uint32x4_t in) : m_neon(in) {}
639 #if defined(BOTAN_SIMD_USE_SSE2)
641 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
642 __vector
unsigned int m_vmx;
643 #elif defined(BOTAN_SIMD_USE_NEON)
646 uint32_t m_scalar[4];
SIMD_4x32(const uint32_t B[4])
SIMD_4x32 operator~() const
SIMD_4x32 operator&(const SIMD_4x32 &other) const
void store_be(uint16_t in, uint8_t out[2])
void copy_out_le(uint8_t out[], size_t out_bytes, const T in[])
static SIMD_4x32 load_le(const void *in)
T rotate_left(T input, size_t rot)
void rotate_right(size_t rot)
void operator^=(const SIMD_4x32 &other)
SIMD_4x32 operator^(const SIMD_4x32 &other) const
static void transpose(SIMD_4x32 &B0, SIMD_4x32 &B1, SIMD_4x32 &B2, SIMD_4x32 &B3)
void operator+=(const SIMD_4x32 &other)
T load_be(const uint8_t in[], size_t off)
SIMD_4x32 operator>>(size_t shift) const
SIMD_4x32 andc(const SIMD_4x32 &other) const
static SIMD_4x32 load_be(const void *in)
void store_le(uint8_t out[]) const
SIMD_4x32 operator-(const SIMD_4x32 &other) const
T load_le(const uint8_t in[], size_t off)
static bool is_little_endian()
uint16_t reverse_bytes(uint16_t val)
static SIMD_4x32 splat(uint32_t B)
SIMD_4x32 operator|(const SIMD_4x32 &other) const
SIMD_4x32 operator+(const SIMD_4x32 &other) const
void operator|=(const SIMD_4x32 &other)
SIMD_4x32 operator<<(size_t shift) const
void operator-=(const SIMD_4x32 &other)
SIMD_4x32 & operator=(const SIMD_4x32 &other)=default
SIMD_4x32(uint32_t B0, uint32_t B1, uint32_t B2, uint32_t B3)
static bool is_big_endian()
void operator&=(const SIMD_4x32 &other)
void rotate_left(size_t rot)
void store_be(uint8_t out[]) const
void copy_out_be(uint8_t out[], size_t out_bytes, const T in[])
void store_le(uint16_t in, uint8_t out[2])