Botan  2.13.0
Crypto and TLS for C++11
sm4_armv8.cpp
Go to the documentation of this file.
1 /*
2 * (C) 2018 Jack Lloyd
3 *
4 * Botan is released under the Simplified BSD License (see license.txt)
5 */
6 
7 #include <botan/sm4.h>
8 #include <arm_neon.h>
9 
10 namespace Botan {
11 
12 namespace {
13 
/*
* Reverse the order of the four 32-bit words of B. The byte order inside
* each word is left unchanged.
*/
inline uint32x4_t qswap_32(uint32x4_t B)
   {
   // Byte indices for the table lookup: output word i is input word 3-i.
   // The table is a plain byte array loaded with vld1q_u8 instead of a
   // compound-literal vector cast, which is a GNU extension and not
   // accepted by every compiler.
   alignas(16) static const uint8_t tbl[16] = { 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 };

   return vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(B), vld1q_u8(tbl)));
   }
19 
/*
* Reverse the bytes inside each of the four 32-bit words of B.
* The order of the words themselves is left unchanged.
*/
inline uint32x4_t bswap_32(uint32x4_t B)
   {
   const uint8x16_t as_bytes = vreinterpretq_u8_u32(B);
   const uint8x16_t reversed = vrev32q_u8(as_bytes);
   return vreinterpretq_u32_u8(reversed);
   }
24 
/*
* Reverse the order of the four 32-bit words of B and also the bytes
* inside each word, i.e. reverse all 16 bytes of the vector.
* Equivalent to bswap_32(qswap_32(B)), done with a single table lookup.
*/
inline uint32x4_t bqswap_32(uint32x4_t B)
   {
   // Full byte reversal expressed as lookup indices. The table is a plain
   // byte array loaded with vld1q_u8 instead of a compound-literal vector
   // cast, which is a GNU extension and not accepted by every compiler.
   alignas(16) static const uint8_t tbl[16] = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };

   return vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(B), vld1q_u8(tbl)));
   }
34 
/*
* Apply vsm4eq_u32 with the round-key vector RK to four independent state
* vectors, overwriting each one with its result. The do/while(0) wrapper
* makes an invocation behave as a single statement.
*/
#define SM4_E(X0, X1, X2, X3, RK) do { \
   X0 = vsm4eq_u32(X0, RK);            \
   X1 = vsm4eq_u32(X1, RK);            \
   X2 = vsm4eq_u32(X2, RK);            \
   X3 = vsm4eq_u32(X3, RK);            \
   } while(0)
41 
42 }
43 
/*
* Encrypt `blocks` consecutive 16-byte blocks from input8 into output8
* using the ARMv8 SM4 instruction (vsm4eq_u32).
*
* The byte buffers are read and written with the byte-element intrinsics
* vld1q_u8/vst1q_u8 and reinterpreted in-register, so nothing is assumed
* about their alignment and no uint8_t* -> uint32_t* pointer cast is needed.
*
* @param input8  source bytes, at least 16*blocks bytes
* @param output8 destination bytes, at least 16*blocks bytes
* @param blocks  number of 16-byte blocks to process
*/
BOTAN_FUNC_ISA("+sm4")
void SM4::sm4_armv8_encrypt(const uint8_t input8[], uint8_t output8[], size_t blocks) const
   {
   // m_RK[0..31] as eight vectors of four round keys; each vsm4eq_u32 call
   // consumes one vector.
   const uint32x4_t K0 = vld1q_u32(&m_RK[ 0]);
   const uint32x4_t K1 = vld1q_u32(&m_RK[ 4]);
   const uint32x4_t K2 = vld1q_u32(&m_RK[ 8]);
   const uint32x4_t K3 = vld1q_u32(&m_RK[12]);
   const uint32x4_t K4 = vld1q_u32(&m_RK[16]);
   const uint32x4_t K5 = vld1q_u32(&m_RK[20]);
   const uint32x4_t K6 = vld1q_u32(&m_RK[24]);
   const uint32x4_t K7 = vld1q_u32(&m_RK[28]);

   const uint8_t* in = input8;
   uint8_t* out = output8;

   // Main loop: four independent blocks per iteration.
   while(blocks >= 4)
      {
      // Load each block and byte-swap every 32-bit word.
      uint32x4_t B0 = bswap_32(vreinterpretq_u32_u8(vld1q_u8(in     )));
      uint32x4_t B1 = bswap_32(vreinterpretq_u32_u8(vld1q_u8(in + 16)));
      uint32x4_t B2 = bswap_32(vreinterpretq_u32_u8(vld1q_u8(in + 32)));
      uint32x4_t B3 = bswap_32(vreinterpretq_u32_u8(vld1q_u8(in + 48)));

      SM4_E(B0, B1, B2, B3, K0);
      SM4_E(B0, B1, B2, B3, K1);
      SM4_E(B0, B1, B2, B3, K2);
      SM4_E(B0, B1, B2, B3, K3);
      SM4_E(B0, B1, B2, B3, K4);
      SM4_E(B0, B1, B2, B3, K5);
      SM4_E(B0, B1, B2, B3, K6);
      SM4_E(B0, B1, B2, B3, K7);

      // The result is stored with both the word order and the bytes inside
      // each word reversed.
      vst1q_u8(out     , vreinterpretq_u8_u32(bqswap_32(B0)));
      vst1q_u8(out + 16, vreinterpretq_u8_u32(bqswap_32(B1)));
      vst1q_u8(out + 32, vreinterpretq_u8_u32(bqswap_32(B2)));
      vst1q_u8(out + 48, vreinterpretq_u8_u32(bqswap_32(B3)));

      in += 4*16;
      out += 4*16;
      blocks -= 4;
      }

   // Tail: the remaining 0..3 blocks, one at a time.
   for(size_t i = 0; i != blocks; ++i)
      {
      uint32x4_t B = bswap_32(vreinterpretq_u32_u8(vld1q_u8(in)));

      B = vsm4eq_u32(B, K0);
      B = vsm4eq_u32(B, K1);
      B = vsm4eq_u32(B, K2);
      B = vsm4eq_u32(B, K3);
      B = vsm4eq_u32(B, K4);
      B = vsm4eq_u32(B, K5);
      B = vsm4eq_u32(B, K6);
      B = vsm4eq_u32(B, K7);

      vst1q_u8(out, vreinterpretq_u8_u32(bqswap_32(B)));

      in += 16;
      out += 16;
      }
   }
104 
/*
* Decrypt `blocks` consecutive 16-byte blocks from input8 into output8
* using the ARMv8 SM4 instruction (vsm4eq_u32).
*
* Same data path as encryption, but the round keys are applied in reverse:
* each key vector has its four words reversed (qswap_32) and the vectors
* are used from K7 down to K0.
*
* The byte buffers are read and written with the byte-element intrinsics
* vld1q_u8/vst1q_u8 and reinterpreted in-register, so nothing is assumed
* about their alignment and no uint8_t* -> uint32_t* pointer cast is needed.
*
* @param input8  source bytes, at least 16*blocks bytes
* @param output8 destination bytes, at least 16*blocks bytes
* @param blocks  number of 16-byte blocks to process
*/
BOTAN_FUNC_ISA("+sm4")
void SM4::sm4_armv8_decrypt(const uint8_t input8[], uint8_t output8[], size_t blocks) const
   {
   // m_RK[0..31] as eight vectors of four round keys, each with its word
   // order reversed.
   const uint32x4_t K0 = qswap_32(vld1q_u32(&m_RK[ 0]));
   const uint32x4_t K1 = qswap_32(vld1q_u32(&m_RK[ 4]));
   const uint32x4_t K2 = qswap_32(vld1q_u32(&m_RK[ 8]));
   const uint32x4_t K3 = qswap_32(vld1q_u32(&m_RK[12]));
   const uint32x4_t K4 = qswap_32(vld1q_u32(&m_RK[16]));
   const uint32x4_t K5 = qswap_32(vld1q_u32(&m_RK[20]));
   const uint32x4_t K6 = qswap_32(vld1q_u32(&m_RK[24]));
   const uint32x4_t K7 = qswap_32(vld1q_u32(&m_RK[28]));

   const uint8_t* in = input8;
   uint8_t* out = output8;

   // Main loop: four independent blocks per iteration.
   while(blocks >= 4)
      {
      // Load each block and byte-swap every 32-bit word.
      uint32x4_t B0 = bswap_32(vreinterpretq_u32_u8(vld1q_u8(in     )));
      uint32x4_t B1 = bswap_32(vreinterpretq_u32_u8(vld1q_u8(in + 16)));
      uint32x4_t B2 = bswap_32(vreinterpretq_u32_u8(vld1q_u8(in + 32)));
      uint32x4_t B3 = bswap_32(vreinterpretq_u32_u8(vld1q_u8(in + 48)));

      // Key vectors in reverse order relative to encryption.
      SM4_E(B0, B1, B2, B3, K7);
      SM4_E(B0, B1, B2, B3, K6);
      SM4_E(B0, B1, B2, B3, K5);
      SM4_E(B0, B1, B2, B3, K4);
      SM4_E(B0, B1, B2, B3, K3);
      SM4_E(B0, B1, B2, B3, K2);
      SM4_E(B0, B1, B2, B3, K1);
      SM4_E(B0, B1, B2, B3, K0);

      // The result is stored with both the word order and the bytes inside
      // each word reversed.
      vst1q_u8(out     , vreinterpretq_u8_u32(bqswap_32(B0)));
      vst1q_u8(out + 16, vreinterpretq_u8_u32(bqswap_32(B1)));
      vst1q_u8(out + 32, vreinterpretq_u8_u32(bqswap_32(B2)));
      vst1q_u8(out + 48, vreinterpretq_u8_u32(bqswap_32(B3)));

      in += 4*16;
      out += 4*16;
      blocks -= 4;
      }

   // Tail: the remaining 0..3 blocks, one at a time.
   for(size_t i = 0; i != blocks; ++i)
      {
      uint32x4_t B = bswap_32(vreinterpretq_u32_u8(vld1q_u8(in)));

      B = vsm4eq_u32(B, K7);
      B = vsm4eq_u32(B, K6);
      B = vsm4eq_u32(B, K5);
      B = vsm4eq_u32(B, K4);
      B = vsm4eq_u32(B, K3);
      B = vsm4eq_u32(B, K2);
      B = vsm4eq_u32(B, K1);
      B = vsm4eq_u32(B, K0);

      vst1q_u8(out, vreinterpretq_u8_u32(bqswap_32(B)));

      in += 16;
      out += 16;
      }
   }
165 
166 #undef SM4_E
167 
168 }
#define BOTAN_FUNC_ISA(isa)
Definition: compiler.h:71
Definition: alg_id.cpp:13
#define SM4_E(B0, B1, B2, B3, K)
Definition: sm4_armv8.cpp:35