// aria_simd.cpp - written and placed in the public domain by
//                 Jeffrey Walton, Uri Blumenthal and Marcel Raad.
//
//    This source file uses intrinsics to gain access to ARMv7a and
//    ARMv8a NEON instructions. A separate source file is needed
//    because additional CXXFLAGS are required to enable the
//    appropriate instruction sets in some build configurations.

#include "pch.h"
#include "config.h"
#include "misc.h"

#if (CRYPTOPP_SSSE3_AVAILABLE)
# include <tmmintrin.h>
#endif

#if (CRYPTOPP_ARM_NEON_HEADER)
# include <arm_neon.h>
#endif

#if (CRYPTOPP_ARM_ACLE_HEADER)
# include <stdint.h>
# include <arm_acle.h>
#endif

// Clang intrinsic casts, http://bugs.llvm.org/show_bug.cgi?id=20670
#define M128_CAST(x) ((__m128i *)(void *)(x))
#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))

// Squash MS LNK4221 and libtool warnings
extern const char ARIA_SIMD_FNAME[] = __FILE__;

NAMESPACE_BEGIN(CryptoPP)
NAMESPACE_BEGIN(ARIATab)

extern const word32 S1[256];
extern const word32 S2[256];
extern const word32 X1[256];
extern const word32 X2[256];
extern const word32 KRK[3][4];

NAMESPACE_END
NAMESPACE_END

ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::byte;
using CryptoPP::word32;

inline byte ARIA_BRF(const word32 x, const int y) {
	return static_cast<byte>(GETBYTE(x, y));
}

ANONYMOUS_NAMESPACE_END

NAMESPACE_BEGIN(CryptoPP)

using CryptoPP::ARIATab::S1;
using CryptoPP::ARIATab::S2;
using CryptoPP::ARIATab::X1;
using CryptoPP::ARIATab::X2;
using CryptoPP::ARIATab::KRK;

#if (CRYPTOPP_ARM_NEON_AVAILABLE)

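// ARIA_GSRK_NEON computes the key schedule step RK = X ^ (Y >>> N), where the
// rotation treats Y's four 32-bit words as a single 128-bit value. N is a
// template parameter so the vshrq/vshlq shift amounts are compile-time constants.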
template <unsigned int N>
inline void ARIA_GSRK_NEON(const uint32x4_t X, const uint32x4_t Y, byte RK[16])
{
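	// Decompose the 128-bit rotate-right by N into whole-word rotations
	// (Q1 and Q2 are the vext lane offsets) and a residual bit shift R.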
	enum { Q1 = (4-(N/32)) % 4,
	       Q2 = (3-(N/32)) % 4,
	       R = N % 32
	};

	vst1q_u8(RK, vreinterpretq_u8_u32(
		veorq_u32(X, veorq_u32(
			vshrq_n_u32(vextq_u32(Y, Y, Q1), R),
			vshlq_n_u32(vextq_u32(Y, Y, Q2), 32-R)))));
}

void ARIA_UncheckedSetKey_Schedule_NEON(byte* rk, word32* ws, unsigned int keylen)
{
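	// 'ws' holds the intermediate key-schedule words W0..W3 prepared by the
	// caller. Each ARIA_GSRK_NEON call below derives one 16-byte round key.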
	const uint32x4_t w0 = vld1q_u32(ws+ 0);
	const uint32x4_t w1 = vld1q_u32(ws+ 8);
	const uint32x4_t w2 = vld1q_u32(ws+12);
	const uint32x4_t w3 = vld1q_u32(ws+16);

	ARIA_GSRK_NEON<19>(w0, w1, rk +   0);
	ARIA_GSRK_NEON<19>(w1, w2, rk +  16);
	ARIA_GSRK_NEON<19>(w2, w3, rk +  32);
	ARIA_GSRK_NEON<19>(w3, w0, rk +  48);
	ARIA_GSRK_NEON<31>(w0, w1, rk +  64);
	ARIA_GSRK_NEON<31>(w1, w2, rk +  80);
	ARIA_GSRK_NEON<31>(w2, w3, rk +  96);
	ARIA_GSRK_NEON<31>(w3, w0, rk + 112);
	ARIA_GSRK_NEON<67>(w0, w1, rk + 128);
	ARIA_GSRK_NEON<67>(w1, w2, rk + 144);
	ARIA_GSRK_NEON<67>(w2, w3, rk + 160);
	ARIA_GSRK_NEON<67>(w3, w0, rk + 176);
	ARIA_GSRK_NEON<97>(w0, w1, rk + 192);

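	// A 128-bit key needs 13 round keys; 192-bit and 256-bit keys need 15 and 17.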
	if (keylen > 16)
	{
		ARIA_GSRK_NEON<97>(w1, w2, rk + 208);
		ARIA_GSRK_NEON<97>(w2, w3, rk + 224);

		if (keylen > 24)
		{
			ARIA_GSRK_NEON< 97>(w3, w0, rk + 240);
			ARIA_GSRK_NEON<109>(w0, w1, rk + 256);
		}
	}
}

void ARIA_ProcessAndXorBlock_NEON(const byte* xorBlock, byte* outBlock, const byte *rk, word32 *t)
{
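	// Substitution layer: each byte of the state in 't' is passed through one
	// of the four ARIA S-box tables. The tables hold 32-bit entries, hence the
	// casts and shifts to extract a single byte.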
	outBlock[ 0] = (byte)(X1[ARIA_BRF(t[0],3)]   );
	outBlock[ 1] = (byte)(X2[ARIA_BRF(t[0],2)]>>8);
	outBlock[ 2] = (byte)(S1[ARIA_BRF(t[0],1)]   );
	outBlock[ 3] = (byte)(S2[ARIA_BRF(t[0],0)]   );
	outBlock[ 4] = (byte)(X1[ARIA_BRF(t[1],3)]   );
	outBlock[ 5] = (byte)(X2[ARIA_BRF(t[1],2)]>>8);
	outBlock[ 6] = (byte)(S1[ARIA_BRF(t[1],1)]   );
	outBlock[ 7] = (byte)(S2[ARIA_BRF(t[1],0)]   );
	outBlock[ 8] = (byte)(X1[ARIA_BRF(t[2],3)]   );
	outBlock[ 9] = (byte)(X2[ARIA_BRF(t[2],2)]>>8);
	outBlock[10] = (byte)(S1[ARIA_BRF(t[2],1)]   );
	outBlock[11] = (byte)(S2[ARIA_BRF(t[2],0)]   );
	outBlock[12] = (byte)(X1[ARIA_BRF(t[3],3)]   );
	outBlock[13] = (byte)(X2[ARIA_BRF(t[3],2)]>>8);
	outBlock[14] = (byte)(S1[ARIA_BRF(t[3],1)]   );
	outBlock[15] = (byte)(S2[ARIA_BRF(t[3],0)]   );

	// 'outBlock' and 'xorBlock' may be unaligned.
	if (xorBlock != NULLPTR)
	{
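		// vrev32q_u8 reverses the bytes within each 32-bit word of the
		// round key before it is XOR'd with the substituted state.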
		vst1q_u8(outBlock,
			veorq_u8(
				vld1q_u8(xorBlock),
				veorq_u8(
					vld1q_u8(outBlock),
					vrev32q_u8(vld1q_u8(rk)))));
	}
	else
	{
		vst1q_u8(outBlock,
			veorq_u8(
				vld1q_u8(outBlock),
				vrev32q_u8(vld1q_u8(rk))));
	}
}

#endif  // CRYPTOPP_ARM_NEON_AVAILABLE

#if (CRYPTOPP_SSSE3_AVAILABLE)

void ARIA_ProcessAndXorBlock_SSSE3(const byte* xorBlock, byte* outBlock, const byte *rk, word32 *t)
{
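	// MASK reverses the bytes within each 32-bit word when used with
	// _mm_shuffle_epi8; it is the SSSE3 analogue of vrev32q_u8 above.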
	const __m128i MASK = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3);

	outBlock[ 0] = (byte)(X1[ARIA_BRF(t[0],3)]   );
	outBlock[ 1] = (byte)(X2[ARIA_BRF(t[0],2)]>>8);
	outBlock[ 2] = (byte)(S1[ARIA_BRF(t[0],1)]   );
	outBlock[ 3] = (byte)(S2[ARIA_BRF(t[0],0)]   );
	outBlock[ 4] = (byte)(X1[ARIA_BRF(t[1],3)]   );
	outBlock[ 5] = (byte)(X2[ARIA_BRF(t[1],2)]>>8);
	outBlock[ 6] = (byte)(S1[ARIA_BRF(t[1],1)]   );
	outBlock[ 7] = (byte)(S2[ARIA_BRF(t[1],0)]   );
	outBlock[ 8] = (byte)(X1[ARIA_BRF(t[2],3)]   );
	outBlock[ 9] = (byte)(X2[ARIA_BRF(t[2],2)]>>8);
	outBlock[10] = (byte)(S1[ARIA_BRF(t[2],1)]   );
	outBlock[11] = (byte)(S2[ARIA_BRF(t[2],0)]   );
	outBlock[12] = (byte)(X1[ARIA_BRF(t[3],3)]   );
	outBlock[13] = (byte)(X2[ARIA_BRF(t[3],2)]>>8);
	outBlock[14] = (byte)(S1[ARIA_BRF(t[3],1)]   );
	outBlock[15] = (byte)(S2[ARIA_BRF(t[3],0)]   );

	// 'outBlock' and 'xorBlock' may be unaligned; 'rk' must be 16-byte
	// aligned because _mm_load_si128 is an aligned load.
	if (xorBlock != NULLPTR)
	{
		_mm_storeu_si128(M128_CAST(outBlock),
			_mm_xor_si128(
				_mm_loadu_si128(CONST_M128_CAST(xorBlock)),
				_mm_xor_si128(
					_mm_loadu_si128(CONST_M128_CAST(outBlock)),
					_mm_shuffle_epi8(_mm_load_si128(CONST_M128_CAST(rk)), MASK)))
			);
	}
	else
	{
		_mm_storeu_si128(M128_CAST(outBlock),
			_mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(outBlock)),
				_mm_shuffle_epi8(_mm_load_si128(CONST_M128_CAST(rk)), MASK)));
	}
}

#endif  // CRYPTOPP_SSSE3_AVAILABLE

NAMESPACE_END