Botan  2.13.0
Crypto and TLS for C++11
clmul_cpu.cpp
Go to the documentation of this file.
1 /*
2 * Hook for CLMUL/PMULL
3 * (C) 2013,2017,2019 Jack Lloyd
4 *
5 * Botan is released under the Simplified BSD License (see license.txt)
6 */
7 
8 #include <botan/internal/clmul_cpu.h>
9 #include <botan/internal/simd_32.h>
10 
11 #if defined(BOTAN_SIMD_USE_SSE2)
12  #include <immintrin.h>
13  #include <wmmintrin.h>
14 #endif
15 
16 namespace Botan {
17 
18 namespace {
19 
20 BOTAN_FORCE_INLINE SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) reverse_vector(const SIMD_4x32& in)
21  {
22 #if defined(BOTAN_SIMD_USE_SSE2)
23  const __m128i BSWAP_MASK = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
24  return SIMD_4x32(_mm_shuffle_epi8(in.raw(), BSWAP_MASK));
25 #elif defined(BOTAN_SIMD_USE_NEON)
26  const uint8_t maskb[16] = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
27  const uint8x16_t mask = vld1q_u8(maskb);
28  return SIMD_4x32(vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(in.raw()), mask)));
29 #endif
30  }
31 
32 template<int M>
33 BOTAN_FORCE_INLINE SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_CLMUL_ISA) clmul(const SIMD_4x32& H, const SIMD_4x32& x)
34  {
35  static_assert(M == 0x00 || M == 0x01 || M == 0x10 || M == 0x11, "Valid clmul mode");
36 
37 #if defined(BOTAN_SIMD_USE_SSE2)
38  return SIMD_4x32(_mm_clmulepi64_si128(x.raw(), H.raw(), M));
39 #elif defined(BOTAN_SIMD_USE_NEON)
40  const uint64_t a = vgetq_lane_u64(vreinterpretq_u64_u32(x.raw()), M & 0x01);
41  const uint64_t b = vgetq_lane_u64(vreinterpretq_u64_u32(H.raw()), (M & 0x10) >> 4);
42  return SIMD_4x32(reinterpret_cast<uint32x4_t>(vmull_p64(a, b)));
43 #endif
44  }
45 
46 inline SIMD_4x32 gcm_reduce(const SIMD_4x32& B0, const SIMD_4x32& B1)
47  {
48  SIMD_4x32 X0 = B1.shr<31>();
49  SIMD_4x32 X1 = B1.shl<1>();
50  SIMD_4x32 X2 = B0.shr<31>();
51  SIMD_4x32 X3 = B0.shl<1>();
52 
53  X3 |= X0.shift_elems_right<3>();
54  X3 |= X2.shift_elems_left<1>();
55  X1 |= X0.shift_elems_left<1>();
56 
57  X0 = X1.shl<31>() ^ X1.shl<30>() ^ X1.shl<25>();
58 
59  X1 ^= X0.shift_elems_left<3>();
60 
61  X0 = X1 ^ X3 ^ X0.shift_elems_right<1>();
62  X0 ^= X1.shr<7>() ^ X1.shr<2>() ^ X1.shr<1>();
63  return X0;
64  }
65 
66 inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_CLMUL_ISA) gcm_multiply(const SIMD_4x32& H, const SIMD_4x32& x)
67  {
68  SIMD_4x32 T0 = clmul<0x11>(H, x);
69  SIMD_4x32 T1 = clmul<0x10>(H, x);
70  SIMD_4x32 T2 = clmul<0x01>(H, x);
71  SIMD_4x32 T3 = clmul<0x00>(H, x);
72 
73  T1 ^= T2;
74  T0 ^= T1.shift_elems_right<2>();
75  T3 ^= T1.shift_elems_left<2>();
76 
77  return gcm_reduce(T0, T3);
78  }
79 
80 inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_CLMUL_ISA)
81  gcm_multiply_x4(const SIMD_4x32& H1, const SIMD_4x32& H2, const SIMD_4x32& H3, const SIMD_4x32& H4,
82  const SIMD_4x32& X1, const SIMD_4x32& X2, const SIMD_4x32& X3, const SIMD_4x32& X4)
83  {
84  /*
85  * Mutiply with delayed reduction, algorithm by Krzysztof Jankowski
86  * and Pierre Laurent of Intel
87  */
88 
89  const SIMD_4x32 lo = (clmul<0x00>(H1, X1) ^ clmul<0x00>(H2, X2)) ^
90  (clmul<0x00>(H3, X3) ^ clmul<0x00>(H4, X4));
91 
92  const SIMD_4x32 hi = (clmul<0x11>(H1, X1) ^ clmul<0x11>(H2, X2)) ^
93  (clmul<0x11>(H3, X3) ^ clmul<0x11>(H4, X4));
94 
95  SIMD_4x32 T;
96 
97  T ^= clmul<0x00>(H1 ^ H1.shift_elems_right<2>(), X1 ^ X1.shift_elems_right<2>());
98  T ^= clmul<0x00>(H2 ^ H2.shift_elems_right<2>(), X2 ^ X2.shift_elems_right<2>());
99  T ^= clmul<0x00>(H3 ^ H3.shift_elems_right<2>(), X3 ^ X3.shift_elems_right<2>());
100  T ^= clmul<0x00>(H4 ^ H4.shift_elems_right<2>(), X4 ^ X4.shift_elems_right<2>());
101  T ^= lo;
102  T ^= hi;
103 
104  return gcm_reduce(hi ^ T.shift_elems_right<2>(),
105  lo ^ T.shift_elems_left<2>());
106  }
107 
108 }
109 
110 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA)
111 void gcm_clmul_precompute(const uint8_t H_bytes[16], uint64_t H_pow[4*2])
112  {
113  const SIMD_4x32 H1 = reverse_vector(SIMD_4x32::load_le(H_bytes));
114  const SIMD_4x32 H2 = gcm_multiply(H1, H1);
115  const SIMD_4x32 H3 = gcm_multiply(H1, H2);
116  const SIMD_4x32 H4 = gcm_multiply(H2, H2);
117 
118  H1.store_le(H_pow);
119  H2.store_le(H_pow + 2);
120  H3.store_le(H_pow + 4);
121  H4.store_le(H_pow + 6);
122  }
123 
124 BOTAN_FUNC_ISA(BOTAN_VPERM_ISA)
125 void gcm_multiply_clmul(uint8_t x[16],
126  const uint64_t H_pow[8],
127  const uint8_t input[], size_t blocks)
128  {
129  /*
130  * Algorithms 1 and 5 from Intel's CLMUL guide
131  */
132  const SIMD_4x32 H1 = SIMD_4x32::load_le(H_pow);
133 
134  SIMD_4x32 a = reverse_vector(SIMD_4x32::load_le(x));
135 
136  if(blocks >= 4)
137  {
138  const SIMD_4x32 H2 = SIMD_4x32::load_le(H_pow + 2);
139  const SIMD_4x32 H3 = SIMD_4x32::load_le(H_pow + 4);
140  const SIMD_4x32 H4 = SIMD_4x32::load_le(H_pow + 6);
141 
142  while(blocks >= 4)
143  {
144  const SIMD_4x32 m0 = reverse_vector(SIMD_4x32::load_le(input ));
145  const SIMD_4x32 m1 = reverse_vector(SIMD_4x32::load_le(input + 16*1));
146  const SIMD_4x32 m2 = reverse_vector(SIMD_4x32::load_le(input + 16*2));
147  const SIMD_4x32 m3 = reverse_vector(SIMD_4x32::load_le(input + 16*3));
148 
149  a ^= m0;
150  a = gcm_multiply_x4(H1, H2, H3, H4, m3, m2, m1, a);
151 
152  input += 4*16;
153  blocks -= 4;
154  }
155  }
156 
157  for(size_t i = 0; i != blocks; ++i)
158  {
159  const SIMD_4x32 m = reverse_vector(SIMD_4x32::load_le(input + 16*i));
160 
161  a ^= m;
162  a = gcm_multiply(H1, a);
163  }
164 
165  a = reverse_vector(a);
166  a.store_le(x);
167  }
168 
169 }
void gcm_clmul_precompute(const uint8_t H_bytes[16], uint64_t H_pow[4 *2])
Definition: clmul_cpu.cpp:111
#define BOTAN_FORCE_INLINE
Definition: compiler.h:208
static SIMD_4x32 load_le(const void *in)
Definition: simd_32.h:159
#define BOTAN_FUNC_ISA(isa)
Definition: compiler.h:71
Definition: alg_id.cpp:13
void gcm_multiply_clmul(uint8_t x[16], const uint64_t H_pow[8], const uint8_t input[], size_t blocks)
Definition: clmul_cpu.cpp:125
fe T
Definition: ge.cpp:37
void store_le(uint32_t out[4]) const
Definition: simd_32.h:192