Botan  2.19.1
Crypto and TLS for C++11
sm4_armv8.cpp
Go to the documentation of this file.
1 /*
2 * (C) 2018 Jack Lloyd
3 *
4 * Botan is released under the Simplified BSD License (see license.txt)
5 */
6 
7 #include <botan/sm4.h>
8 #include <arm_neon.h>
9 
10 namespace Botan {
11 
12 namespace {
13 
/*
* Byte-index tables for vqtbl1q_u8, where output byte i is taken from
* input byte tbl[i]. Each row below covers one 32-bit word of the result.
*
* qswap_tbl reverses the order of the four 32-bit words and leaves the
* bytes inside each word in place.
*/
const uint8_t qswap_tbl[16] = {
   12, 13, 14, 15,
    8,  9, 10, 11,
    4,  5,  6,  7,
    0,  1,  2,  3
};

/*
* bswap_tbl reverses all 16 bytes of the vector.
*/
const uint8_t bswap_tbl[16] = {
   15, 14, 13, 12,
   11, 10,  9,  8,
    7,  6,  5,  4,
    3,  2,  1,  0
};
21 
22 inline uint32x4_t qswap_32(uint32x4_t B)
23  {
24  return vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(B), vld1q_u8(qswap_tbl)));
25  }
26 
27 inline uint32x4_t bswap_32(uint32x4_t B)
28  {
29  return vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(B)));
30  }
31 
32 /*
33  Swap both the quad-words and bytes within each word
34  equivalent to return bswap_32(qswap_32(B))
35 */
36 inline uint32x4_t bqswap_32(uint32x4_t B)
37  {
38  return vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(B), vld1q_u8(bswap_tbl)));
39  }
40 
/*
* Apply one vsm4eq_u32 step with key vector RK to each of four block
* registers, updating them in place. RK is expanded (and therefore
* evaluated) four times; the call sites in this file pass plain variables.
*/
#define SM4_E(X0, X1, X2, X3, RK) do { \
   X0 = vsm4eq_u32(X0, RK);            \
   X1 = vsm4eq_u32(X1, RK);            \
   X2 = vsm4eq_u32(X2, RK);            \
   X3 = vsm4eq_u32(X3, RK);            \
   } while(0)
47 
48 }
49 
50 void BOTAN_FUNC_ISA("arch=armv8.2-a+sm4")
51 SM4::sm4_armv8_encrypt(const uint8_t input8[], uint8_t output8[], size_t blocks) const
52  {
53  const uint32x4_t K0 = vld1q_u32(&m_RK[ 0]);
54  const uint32x4_t K1 = vld1q_u32(&m_RK[ 4]);
55  const uint32x4_t K2 = vld1q_u32(&m_RK[ 8]);
56  const uint32x4_t K3 = vld1q_u32(&m_RK[12]);
57  const uint32x4_t K4 = vld1q_u32(&m_RK[16]);
58  const uint32x4_t K5 = vld1q_u32(&m_RK[20]);
59  const uint32x4_t K6 = vld1q_u32(&m_RK[24]);
60  const uint32x4_t K7 = vld1q_u32(&m_RK[28]);
61 
62  const uint32_t* input32 = reinterpret_cast<const uint32_t*>(reinterpret_cast<const void*>(input8));
63  uint32_t* output32 = reinterpret_cast<uint32_t*>(reinterpret_cast<void*>(output8));
64 
65  while(blocks >= 4)
66  {
67  uint32x4_t B0 = bswap_32(vld1q_u32(input32));
68  uint32x4_t B1 = bswap_32(vld1q_u32(input32+4));
69  uint32x4_t B2 = bswap_32(vld1q_u32(input32+8));
70  uint32x4_t B3 = bswap_32(vld1q_u32(input32+12));
71 
72  SM4_E(B0, B1, B2, B3, K0);
73  SM4_E(B0, B1, B2, B3, K1);
74  SM4_E(B0, B1, B2, B3, K2);
75  SM4_E(B0, B1, B2, B3, K3);
76  SM4_E(B0, B1, B2, B3, K4);
77  SM4_E(B0, B1, B2, B3, K5);
78  SM4_E(B0, B1, B2, B3, K6);
79  SM4_E(B0, B1, B2, B3, K7);
80 
81  vst1q_u32(output32 , bqswap_32(B0));
82  vst1q_u32(output32+ 4, bqswap_32(B1));
83  vst1q_u32(output32+ 8, bqswap_32(B2));
84  vst1q_u32(output32+12, bqswap_32(B3));
85 
86  input32 += 4*4;
87  output32 += 4*4;
88  blocks -= 4;
89  }
90 
91  for(size_t i = 0; i != blocks; ++i)
92  {
93  uint32x4_t B = bswap_32(vld1q_u32(input32));
94 
95  B = vsm4eq_u32(B, K0);
96  B = vsm4eq_u32(B, K1);
97  B = vsm4eq_u32(B, K2);
98  B = vsm4eq_u32(B, K3);
99  B = vsm4eq_u32(B, K4);
100  B = vsm4eq_u32(B, K5);
101  B = vsm4eq_u32(B, K6);
102  B = vsm4eq_u32(B, K7);
103 
104  vst1q_u32(output32, bqswap_32(B));
105 
106  input32 += 4;
107  output32 += 4;
108  }
109  }
110 
111 void BOTAN_FUNC_ISA("arch=armv8.2-a+sm4")
112 SM4::sm4_armv8_decrypt(const uint8_t input8[], uint8_t output8[], size_t blocks) const
113  {
114  const uint32x4_t K0 = qswap_32(vld1q_u32(&m_RK[ 0]));
115  const uint32x4_t K1 = qswap_32(vld1q_u32(&m_RK[ 4]));
116  const uint32x4_t K2 = qswap_32(vld1q_u32(&m_RK[ 8]));
117  const uint32x4_t K3 = qswap_32(vld1q_u32(&m_RK[12]));
118  const uint32x4_t K4 = qswap_32(vld1q_u32(&m_RK[16]));
119  const uint32x4_t K5 = qswap_32(vld1q_u32(&m_RK[20]));
120  const uint32x4_t K6 = qswap_32(vld1q_u32(&m_RK[24]));
121  const uint32x4_t K7 = qswap_32(vld1q_u32(&m_RK[28]));
122 
123  const uint32_t* input32 = reinterpret_cast<const uint32_t*>(reinterpret_cast<const void*>(input8));
124  uint32_t* output32 = reinterpret_cast<uint32_t*>(reinterpret_cast<void*>(output8));
125 
126  while(blocks >= 4)
127  {
128  uint32x4_t B0 = bswap_32(vld1q_u32(input32));
129  uint32x4_t B1 = bswap_32(vld1q_u32(input32+4));
130  uint32x4_t B2 = bswap_32(vld1q_u32(input32+8));
131  uint32x4_t B3 = bswap_32(vld1q_u32(input32+12));
132 
133  SM4_E(B0, B1, B2, B3, K7);
134  SM4_E(B0, B1, B2, B3, K6);
135  SM4_E(B0, B1, B2, B3, K5);
136  SM4_E(B0, B1, B2, B3, K4);
137  SM4_E(B0, B1, B2, B3, K3);
138  SM4_E(B0, B1, B2, B3, K2);
139  SM4_E(B0, B1, B2, B3, K1);
140  SM4_E(B0, B1, B2, B3, K0);
141 
142  vst1q_u32(output32 , bqswap_32(B0));
143  vst1q_u32(output32+ 4, bqswap_32(B1));
144  vst1q_u32(output32+ 8, bqswap_32(B2));
145  vst1q_u32(output32+12, bqswap_32(B3));
146 
147  input32 += 4*4;
148  output32 += 4*4;
149  blocks -= 4;
150  }
151 
152  for(size_t i = 0; i != blocks; ++i)
153  {
154  uint32x4_t B = bswap_32(vld1q_u32(input32));
155 
156  B = vsm4eq_u32(B, K7);
157  B = vsm4eq_u32(B, K6);
158  B = vsm4eq_u32(B, K5);
159  B = vsm4eq_u32(B, K4);
160  B = vsm4eq_u32(B, K3);
161  B = vsm4eq_u32(B, K2);
162  B = vsm4eq_u32(B, K1);
163  B = vsm4eq_u32(B, K0);
164 
165  vst1q_u32(output32, bqswap_32(B));
166 
167  input32 += 4;
168  output32 += 4;
169  }
170  }
171 
172 #undef SM4_E
173 
174 }
void BOTAN_FUNC_ISA("arch=armv8.2-a+sm4") SM4 const uint32x4_t K1
Definition: sm4_armv8.cpp:54
const uint32x4_t K5
Definition: sm4_armv8.cpp:58
uint32_t * output32
Definition: sm4_armv8.cpp:63
const uint32_t * input32
Definition: sm4_armv8.cpp:62
const uint32x4_t K2
Definition: sm4_armv8.cpp:55
const uint32x4_t K4
Definition: sm4_armv8.cpp:57
SIMD_8x32 B
#define BOTAN_FUNC_ISA(isa)
Definition: compiler.h:77
const uint32x4_t K3
Definition: sm4_armv8.cpp:56
Definition: alg_id.cpp:13
const uint32x4_t K6
Definition: sm4_armv8.cpp:59
#define SM4_E(B0, B1, B2, B3, K)
Definition: sm4_armv8.cpp:41
const uint32x4_t K7
Definition: sm4_armv8.cpp:60