/*
   BLAKE2 reference source code package - optimized C implementations

   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>

   To the extent possible under law, the author(s) have dedicated all copyright
   and related and neighboring rights to this software to the public domain
   worldwide. This software is distributed without any warranty.

   You should have received a copy of the CC0 Public Domain Dedication along with
   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#pragma once
#ifndef __BLAKE2S_LOAD_SSE41_H__
#define __BLAKE2S_LOAD_SSE41_H__

/*
 * LOAD_MSG_0_1(buf) -- by its name, load 1 of round 0 (confirm against the
 * round code that expands it).
 *
 * Stores into buf the 32-bit lanes { 0, 2, 4, 6 }, lowest lane first, where
 * index i stands for lane (i % 4) of vector m(i / 4).  This assumes TOF/TOI
 * (defined outside this header) are bit-preserving casts -- TODO confirm.
 * Reads m0 and m1, which must be in scope where the macro is expanded.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the expansion is exactly one statement: write LOAD_MSG_0_1(x);
 */
#define LOAD_MSG_0_1(buf) \
do { \
  (buf) = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(2,0,2,0))); \
} while (0)

/*
 * LOAD_MSG_0_2(buf) -- by its name, load 2 of round 0 (confirm against the
 * round code that expands it).
 *
 * Stores into buf the 32-bit lanes { 1, 3, 5, 7 }, lowest lane first, where
 * index i stands for lane (i % 4) of vector m(i / 4).  This assumes TOF/TOI
 * (defined outside this header) are bit-preserving casts -- TODO confirm.
 * Reads m0 and m1, which must be in scope where the macro is expanded.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the expansion is exactly one statement: write LOAD_MSG_0_2(x);
 */
#define LOAD_MSG_0_2(buf) \
do { \
  (buf) = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(3,1,3,1))); \
} while (0)

/*
 * LOAD_MSG_0_3(buf) -- by its name, load 3 of round 0 (confirm against the
 * round code that expands it).
 *
 * Stores into buf the 32-bit lanes { 8, 10, 12, 14 }, lowest lane first,
 * where index i stands for lane (i % 4) of vector m(i / 4).  This assumes
 * TOF/TOI (defined outside this header) are bit-preserving casts -- TODO
 * confirm.
 * Reads m2 and m3, which must be in scope where the macro is expanded.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the expansion is exactly one statement: write LOAD_MSG_0_3(x);
 */
#define LOAD_MSG_0_3(buf) \
do { \
  (buf) = TOI(_mm_shuffle_ps(TOF(m2), TOF(m3), _MM_SHUFFLE(2,0,2,0))); \
} while (0)

/*
 * LOAD_MSG_0_4(buf) -- by its name, load 4 of round 0 (confirm against the
 * round code that expands it).
 *
 * Stores into buf the 32-bit lanes { 9, 11, 13, 15 }, lowest lane first,
 * where index i stands for lane (i % 4) of vector m(i / 4).  This assumes
 * TOF/TOI (defined outside this header) are bit-preserving casts -- TODO
 * confirm.
 * Reads m2 and m3, which must be in scope where the macro is expanded.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the expansion is exactly one statement: write LOAD_MSG_0_4(x);
 */
#define LOAD_MSG_0_4(buf) \
do { \
  (buf) = TOI(_mm_shuffle_ps(TOF(m2), TOF(m3), _MM_SHUFFLE(3,1,3,1))); \
} while (0)

/*
 * LOAD_MSG_1_1(buf) -- by its name, load 1 of round 1 (confirm against the
 * round code that expands it).
 *
 * Stores into buf the 32-bit lanes { 14, 4, 9, 13 }, lowest lane first,
 * where index i stands for lane (i % 4) of vector m(i / 4).
 * Reads m1, m2, m3 and overwrites the scratch vectors t0, t1, t2; all of
 * them must be in scope where the macro is expanded.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the expansion is exactly one statement: write LOAD_MSG_1_1(x);
 */
#define LOAD_MSG_1_1(buf) \
do { \
  t0 = _mm_blend_epi16(m1, m2, 0x0C); \
  t1 = _mm_slli_si128(m3, 4); \
  t2 = _mm_blend_epi16(t0, t1, 0xF0); \
  (buf) = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,0,3)); \
} while (0)

/*
 * LOAD_MSG_1_2(buf) -- by its name, load 2 of round 1 (confirm against the
 * round code that expands it).
 *
 * Stores into buf the 32-bit lanes { 10, 8, 15, 6 }, lowest lane first,
 * where index i stands for lane (i % 4) of vector m(i / 4).
 * Reads m1, m2, m3 and overwrites the scratch vectors t0, t1, t2; all of
 * them must be in scope where the macro is expanded.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the expansion is exactly one statement: write LOAD_MSG_1_2(x);
 */
#define LOAD_MSG_1_2(buf) \
do { \
  t0 = _mm_shuffle_epi32(m2, _MM_SHUFFLE(0,0,2,0)); \
  t1 = _mm_blend_epi16(m1, m3, 0xC0); \
  t2 = _mm_blend_epi16(t0, t1, 0xF0); \
  (buf) = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1)); \
} while (0)

/*
 * LOAD_MSG_1_3(buf) -- by its name, load 3 of round 1 (confirm against the
 * round code that expands it).
 *
 * Stores into buf the 32-bit lanes { 1, 0, 11, 5 }, lowest lane first,
 * where index i stands for lane (i % 4) of vector m(i / 4).
 * Reads m0, m1, m2 and overwrites the scratch vectors t0, t1, t2; all of
 * them must be in scope where the macro is expanded.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the expansion is exactly one statement: write LOAD_MSG_1_3(x);
 */
#define LOAD_MSG_1_3(buf) \
do { \
  t0 = _mm_slli_si128(m1, 4); \
  t1 = _mm_blend_epi16(m2, t0, 0x30); \
  t2 = _mm_blend_epi16(m0, t1, 0xF0); \
  (buf) = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1)); \
} while (0)

/*
 * LOAD_MSG_1_4(buf) -- by its name, load 4 of round 1 (confirm against the
 * round code that expands it).
 *
 * Stores into buf the 32-bit lanes { 12, 2, 7, 3 }, lowest lane first,
 * where index i stands for lane (i % 4) of vector m(i / 4).
 * Reads m0, m1, m3 and overwrites the scratch vectors t0, t1, t2; all of
 * them must be in scope where the macro is expanded.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the expansion is exactly one statement: write LOAD_MSG_1_4(x);
 */
#define LOAD_MSG_1_4(buf) \
do { \
  t0 = _mm_unpackhi_epi32(m0, m1); \
  t1 = _mm_slli_si128(m3, 4); \
  t2 = _mm_blend_epi16(t0, t1, 0x0C); \
  (buf) = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1)); \
} while (0)

/*
 * LOAD_MSG_2_1(buf) -- by its name, load 1 of round 2 (confirm against the
 * round code that expands it).
 *
 * Stores into buf the 32-bit lanes { 11, 12, 5, 15 }, lowest lane first,
 * where index i stands for lane (i % 4) of vector m(i / 4).
 * Reads m1, m2, m3 and overwrites the scratch vectors t0, t1, t2; all of
 * them must be in scope where the macro is expanded.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the expansion is exactly one statement: write LOAD_MSG_2_1(x);
 */
#define LOAD_MSG_2_1(buf) \
do { \
  t0 = _mm_unpackhi_epi32(m2, m3); \
  t1 = _mm_blend_epi16(m3, m1, 0x0C); \
  t2 = _mm_blend_epi16(t0, t1, 0x0F); \
  (buf) = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2)); \
} while (0)

/*
 * LOAD_MSG_2_2(buf) -- by its name, load 2 of round 2 (confirm against the
 * round code that expands it).
 *
 * Stores into buf the 32-bit lanes { 8, 0, 2, 13 }, lowest lane first,
 * where index i stands for lane (i % 4) of vector m(i / 4).
 * Reads m0, m2, m3 and overwrites the scratch vectors t0, t1, t2; all of
 * them must be in scope where the macro is expanded.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the expansion is exactly one statement: write LOAD_MSG_2_2(x);
 */
#define LOAD_MSG_2_2(buf) \
do { \
  t0 = _mm_unpacklo_epi32(m2, m0); \
  t1 = _mm_blend_epi16(t0, m0, 0xF0); \
  t2 = _mm_slli_si128(m3, 8); \
  (buf) = _mm_blend_epi16(t1, t2, 0xC0); \
} while (0)

/*
 * LOAD_MSG_2_3(buf) -- by its name, load 3 of round 2 (confirm against the
 * round code that expands it).
 *
 * Stores into buf the 32-bit lanes { 10, 3, 7, 9 }, lowest lane first,
 * where index i stands for lane (i % 4) of vector m(i / 4).
 * Reads m0, m1, m2 and overwrites the scratch vectors t0, t1, t2; all of
 * them must be in scope where the macro is expanded.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the expansion is exactly one statement: write LOAD_MSG_2_3(x);
 */
#define LOAD_MSG_2_3(buf) \
do { \
  t0 = _mm_blend_epi16(m0, m2, 0x3C); \
  t1 = _mm_srli_si128(m1, 12); \
  t2 = _mm_blend_epi16(t0, t1, 0x03); \
  (buf) = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,3,2)); \
} while (0)

/*
 * LOAD_MSG_2_4(buf) -- by its name, load 4 of round 2 (confirm against the
 * round code that expands it).
 *
 * Stores into buf the 32-bit lanes { 14, 6, 1, 4 }, lowest lane first,
 * where index i stands for lane (i % 4) of vector m(i / 4).
 * Reads m0, m1, m3 and overwrites the scratch vectors t0, t1, t2; all of
 * them must be in scope where the macro is expanded.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the expansion is exactly one statement: write LOAD_MSG_2_4(x);
 */
#define LOAD_MSG_2_4(buf) \
do { \
  t0 = _mm_slli_si128(m3, 4); \
  t1 = _mm_blend_epi16(m0, m1, 0x33); \
  t2 = _mm_blend_epi16(t1, t0, 0xC0); \
  (buf) = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0,1,2,3)); \
} while (0)

/*
 * LOAD_MSG_3_1(buf) -- by its name, load 1 of round 3 (confirm against the
 * round code that expands it).
 *
 * Stores into buf the 32-bit lanes { 7, 3, 13, 11 }, lowest lane first,
 * where index i stands for lane (i % 4) of vector m(i / 4).
 * Reads m0, m1, m2, m3 and overwrites the scratch vectors t0, t1, t2; all
 * of them must be in scope where the macro is expanded.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the expansion is exactly one statement: write LOAD_MSG_3_1(x);
 */
#define LOAD_MSG_3_1(buf) \
do { \
  t0 = _mm_unpackhi_epi32(m0, m1); \
  t1 = _mm_unpackhi_epi32(t0, m2); \
  t2 = _mm_blend_epi16(t1, m3, 0x0C); \
  (buf) = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2)); \
} while (0)

/*
 * LOAD_MSG_3_2(buf) -- by its name, load 2 of round 3 (confirm against the
 * round code that expands it).
 *
 * Stores into buf the 32-bit lanes { 9, 1, 12, 14 }, lowest lane first,
 * where index i stands for lane (i % 4) of vector m(i / 4).
 * Reads m0, m2, m3 and overwrites the scratch vectors t0, t1, t2; all of
 * them must be in scope where the macro is expanded.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the expansion is exactly one statement: write LOAD_MSG_3_2(x);
 */
#define LOAD_MSG_3_2(buf) \
do { \
  t0 = _mm_slli_si128(m2, 8); \
  t1 = _mm_blend_epi16(m3, m0, 0x0C); \
  t2 = _mm_blend_epi16(t1, t0, 0xC0); \
  (buf) = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3)); \
} while (0)

/*
 * LOAD_MSG_3_3(buf) -- by its name, load 3 of round 3 (confirm against the
 * round code that expands it).
 *
 * Stores into buf the 32-bit lanes { 2, 5, 4, 15 }, lowest lane first,
 * where index i stands for lane (i % 4) of vector m(i / 4).
 * Reads m0, m1, m3 and overwrites the scratch vectors t0, t1; all of them
 * must be in scope where the macro is expanded.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the expansion is exactly one statement: write LOAD_MSG_3_3(x);
 */
#define LOAD_MSG_3_3(buf) \
do { \
  t0 = _mm_blend_epi16(m0, m1, 0x0F); \
  t1 = _mm_blend_epi16(t0, m3, 0xC0); \
  (buf) = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2)); \
} while (0)

/*
 * LOAD_MSG_3_4(buf) -- by its name, load 4 of round 3 (confirm against the
 * round code that expands it).
 *
 * Stores into buf the 32-bit lanes { 6, 10, 0, 8 }, lowest lane first,
 * where index i stands for lane (i % 4) of vector m(i / 4).
 * Reads m0, m1, m2 and overwrites the scratch vectors t0, t1; all of them
 * must be in scope where the macro is expanded.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the expansion is exactly one statement: write LOAD_MSG_3_4(x);
 */
#define LOAD_MSG_3_4(buf) \
do { \
  t0 = _mm_unpacklo_epi32(m0, m2); \
  t1 = _mm_unpackhi_epi32(m1, m2); \
  (buf) = _mm_unpacklo_epi64(t1, t0); \
} while (0)

/*
 * LOAD_MSG_4_1(buf) -- by its name, load 1 of round 4 (confirm against the
 * round code that expands it).
 *
 * Stores into buf the 32-bit lanes { 9, 5, 2, 10 }, lowest lane first,
 * where index i stands for lane (i % 4) of vector m(i / 4).
 * Reads m0, m1, m2 and overwrites the scratch vectors t0, t1, t2; all of
 * them must be in scope where the macro is expanded.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the expansion is exactly one statement: write LOAD_MSG_4_1(x);
 */
#define LOAD_MSG_4_1(buf) \
do { \
  t0 = _mm_unpacklo_epi64(m1, m2); \
  t1 = _mm_unpackhi_epi64(m0, m2); \
  t2 = _mm_blend_epi16(t0, t1, 0x33); \
  (buf) = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3)); \
} while (0)

/*
 * LOAD_MSG_4_2(buf) -- by its name, load 2 of round 4 (confirm against the
 * round code that expands it).
 *
 * Stores into buf the 32-bit lanes { 0, 7, 4, 15 }, lowest lane first,
 * where index i stands for lane (i % 4) of vector m(i / 4).
 * Reads m0, m1, m3 and overwrites the scratch vectors t0, t1; all of them
 * must be in scope where the macro is expanded.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the expansion is exactly one statement: write LOAD_MSG_4_2(x);
 */
#define LOAD_MSG_4_2(buf) \
do { \
  t0 = _mm_unpackhi_epi64(m1, m3); \
  t1 = _mm_unpacklo_epi64(m0, m1); \
  (buf) = _mm_blend_epi16(t0, t1, 0x33); \
} while (0)

/*
 * LOAD_MSG_4_3(buf) -- by its name, load 3 of round 4 (confirm against the
 * round code that expands it).
 *
 * Stores into buf the 32-bit lanes { 14, 11, 6, 3 }, lowest lane first,
 * where index i stands for lane (i % 4) of vector m(i / 4).
 * Reads m0, m1, m2, m3 and overwrites the scratch vectors t0, t1; all of
 * them must be in scope where the macro is expanded.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the expansion is exactly one statement: write LOAD_MSG_4_3(x);
 */
#define LOAD_MSG_4_3(buf) \
do { \
  t0 = _mm_unpackhi_epi64(m3, m1); \
  t1 = _mm_unpackhi_epi64(m2, m0); \
  (buf) = _mm_blend_epi16(t1, t0, 0x33); \
} while (0)

/*
 * LOAD_MSG_4_4(buf) -- by its name, load 4 of round 4 (confirm against the
 * round code that expands it).
 *
 * Stores into buf the 32-bit lanes { 1, 12, 8, 13 }, lowest lane first,
 * where index i stands for lane (i % 4) of vector m(i / 4).
 * Reads m0, m2, m3 and overwrites the scratch vectors t0, t1, t2; all of
 * them must be in scope where the macro is expanded.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the expansion is exactly one statement: write LOAD_MSG_4_4(x);
 */
#define LOAD_MSG_4_4(buf) \
do { \
  t0 = _mm_blend_epi16(m0, m2, 0x03); \
  t1 = _mm_slli_si128(t0, 8); \
  t2 = _mm_blend_epi16(t1, m3, 0x0F); \
  (buf) = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,0,3)); \
} while (0)

/*
 * LOAD_MSG_5_1(buf) -- by its name, load 1 of round 5 (confirm against the
 * round code that expands it).
 *
 * Stores into buf the 32-bit lanes { 2, 6, 0, 8 }, lowest lane first,
 * where index i stands for lane (i % 4) of vector m(i / 4).
 * Reads m0, m1, m2 and overwrites the scratch vectors t0, t1; all of them
 * must be in scope where the macro is expanded.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the expansion is exactly one statement: write LOAD_MSG_5_1(x);
 */
#define LOAD_MSG_5_1(buf) \
do { \
  t0 = _mm_unpackhi_epi32(m0, m1); \
  t1 = _mm_unpacklo_epi32(m0, m2); \
  (buf) = _mm_unpacklo_epi64(t0, t1); \
} while (0)

/*
 * LOAD_MSG_5_2(buf) -- by its name, load 2 of round 5 (confirm against the
 * round code that expands it).
 *
 * Stores into buf the 32-bit lanes { 12, 10, 11, 3 }, lowest lane first,
 * where index i stands for lane (i % 4) of vector m(i / 4).
 * Reads m0, m2, m3 and overwrites the scratch vectors t0, t1; all of them
 * must be in scope where the macro is expanded.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the expansion is exactly one statement: write LOAD_MSG_5_2(x);
 */
#define LOAD_MSG_5_2(buf) \
do { \
  t0 = _mm_srli_si128(m2, 4); \
  t1 = _mm_blend_epi16(m0, m3, 0x03); \
  (buf) = _mm_blend_epi16(t1, t0, 0x3C); \
} while (0)

/*
 * LOAD_MSG_5_3(buf) -- by its name, load 3 of round 5 (confirm against the
 * round code that expands it).
 *
 * Stores into buf the 32-bit lanes { 4, 7, 15, 1 }, lowest lane first,
 * where index i stands for lane (i % 4) of vector m(i / 4).
 * Reads m0, m1, m3 and overwrites the scratch vectors t0, t1, t2; all of
 * them must be in scope where the macro is expanded.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the expansion is exactly one statement: write LOAD_MSG_5_3(x);
 */
#define LOAD_MSG_5_3(buf) \
do { \
  t0 = _mm_blend_epi16(m1, m0, 0x0C); \
  t1 = _mm_srli_si128(m3, 4); \
  t2 = _mm_blend_epi16(t0, t1, 0x30); \
  (buf) = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,3,0)); \
} while (0)

/*
 * LOAD_MSG_5_4(buf) -- by its name, load 4 of round 5 (confirm against the
 * round code that expands it).
 *
 * Stores into buf the 32-bit lanes { 13, 5, 14, 9 }, lowest lane first,
 * where index i stands for lane (i % 4) of vector m(i / 4).
 * Reads m1, m2, m3 and overwrites the scratch vectors t0, t1; all of them
 * must be in scope where the macro is expanded.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the expansion is exactly one statement: write LOAD_MSG_5_4(x);
 */
#define LOAD_MSG_5_4(buf) \
do { \
  t0 = _mm_unpacklo_epi64(m1, m2); \
  t1 = _mm_shuffle_epi32(m3, _MM_SHUFFLE(0,2,0,1)); \
  (buf) = _mm_blend_epi16(t0, t1, 0x33); \
} while (0)

/*
 * LOAD_MSG_6_1(buf) -- by its name, load 1 of round 6 (confirm against the
 * round code that expands it).
 *
 * Stores into buf the 32-bit lanes { 12, 1, 14, 4 }, lowest lane first,
 * where index i stands for lane (i % 4) of vector m(i / 4).
 * Reads m0, m1, m3 and overwrites the scratch vectors t0, t1; all of them
 * must be in scope where the macro is expanded.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the expansion is exactly one statement: write LOAD_MSG_6_1(x);
 */
#define LOAD_MSG_6_1(buf) \
do { \
  t0 = _mm_slli_si128(m1, 12); \
  t1 = _mm_blend_epi16(m0, m3, 0x33); \
  (buf) = _mm_blend_epi16(t1, t0, 0xC0); \
} while (0)

/*
 * LOAD_MSG_6_2(buf) -- by its name, load 2 of round 6 (confirm against the
 * round code that expands it).
 *
 * Stores into buf the 32-bit lanes { 5, 15, 13, 10 }, lowest lane first,
 * where index i stands for lane (i % 4) of vector m(i / 4).
 * Reads m1, m2, m3 and overwrites the scratch vectors t0, t1, t2; all of
 * them must be in scope where the macro is expanded.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the expansion is exactly one statement: write LOAD_MSG_6_2(x);
 */
#define LOAD_MSG_6_2(buf) \
do { \
  t0 = _mm_blend_epi16(m3, m2, 0x30); \
  t1 = _mm_srli_si128(m1, 4); \
  t2 = _mm_blend_epi16(t0, t1, 0x03); \
  (buf) = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,3,0)); \
} while (0)

/*
 * LOAD_MSG_6_3(buf) -- by its name, load 3 of round 6 (confirm against the
 * round code that expands it).
 *
 * Stores into buf the 32-bit lanes { 0, 6, 9, 8 }, lowest lane first,
 * where index i stands for lane (i % 4) of vector m(i / 4).
 * Reads m0, m1, m2 and overwrites the scratch vectors t0, t1; all of them
 * must be in scope where the macro is expanded.  The blend result is fed
 * straight into the shuffle, so t2 is left untouched here.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the expansion is exactly one statement: write LOAD_MSG_6_3(x);
 */
#define LOAD_MSG_6_3(buf) \
do { \
  t0 = _mm_unpacklo_epi64(m0, m2); \
  t1 = _mm_srli_si128(m1, 4); \
  (buf) = _mm_shuffle_epi32(_mm_blend_epi16(t0, t1, 0x0C), _MM_SHUFFLE(2,3,1,0)); \
} while (0)

/*
 * LOAD_MSG_6_4(buf) -- by its name, load 4 of round 6 (confirm against the
 * round code that expands it).
 *
 * Stores into buf the 32-bit lanes { 7, 3, 2, 11 }, lowest lane first,
 * where index i stands for lane (i % 4) of vector m(i / 4).
 * Reads m0, m1, m2 and overwrites the scratch vectors t0, t1; all of them
 * must be in scope where the macro is expanded.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the expansion is exactly one statement: write LOAD_MSG_6_4(x);
 */
#define LOAD_MSG_6_4(buf) \
do { \
  t0 = _mm_unpackhi_epi32(m1, m2); \
  t1 = _mm_unpackhi_epi64(m0, t0); \
  (buf) = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2)); \
} while (0)

/*
 * LOAD_MSG_7_1(buf) -- by its name, load 1 of round 7 (confirm against the
 * round code that expands it).
 *
 * Stores into buf the 32-bit lanes { 13, 7, 12, 3 }, lowest lane first,
 * where index i stands for lane (i % 4) of vector m(i / 4).
 * Reads m0, m1, m3 and overwrites the scratch vectors t0, t1; all of them
 * must be in scope where the macro is expanded.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the expansion is exactly one statement: write LOAD_MSG_7_1(x);
 */
#define LOAD_MSG_7_1(buf) \
do { \
  t0 = _mm_unpackhi_epi32(m0, m1); \
  t1 = _mm_blend_epi16(t0, m3, 0x0F); \
  (buf) = _mm_shuffle_epi32(t1, _MM_SHUFFLE(2,0,3,1)); \
} while (0)

/*
 * LOAD_MSG_7_2(buf) -- by its name, load 2 of round 7 (confirm against the
 * round code that expands it).
 *
 * Stores into buf the 32-bit lanes { 11, 14, 1, 9 }, lowest lane first,
 * where index i stands for lane (i % 4) of vector m(i / 4).
 * Reads m0, m2, m3 and overwrites the scratch vectors t0, t1, t2; all of
 * them must be in scope where the macro is expanded.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the expansion is exactly one statement: write LOAD_MSG_7_2(x);
 */
#define LOAD_MSG_7_2(buf) \
do { \
  t0 = _mm_blend_epi16(m2, m3, 0x30); \
  t1 = _mm_srli_si128(m0, 4); \
  t2 = _mm_blend_epi16(t0, t1, 0x03); \
  (buf) = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,2,3)); \
} while (0)

/*
 * LOAD_MSG_7_3(buf) -- by its name, load 3 of round 7 (confirm against the
 * round code that expands it).
 *
 * Stores into buf the 32-bit lanes { 5, 15, 8, 2 }, lowest lane first,
 * where index i stands for lane (i % 4) of vector m(i / 4).
 * Reads m0, m1, m2, m3 and overwrites the scratch vectors t0, t1, t2; all
 * of them must be in scope where the macro is expanded.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the expansion is exactly one statement: write LOAD_MSG_7_3(x);
 */
#define LOAD_MSG_7_3(buf) \
do { \
  t0 = _mm_unpackhi_epi64(m0, m3); \
  t1 = _mm_unpacklo_epi64(m1, m2); \
  t2 = _mm_blend_epi16(t0, t1, 0x3C); \
  (buf) = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0,2,3,1)); \
} while (0)

/*
 * LOAD_MSG_7_4(buf) -- by its name, load 4 of round 7 (confirm against the
 * round code that expands it).
 *
 * Stores into buf the 32-bit lanes { 0, 4, 6, 10 }, lowest lane first,
 * where index i stands for lane (i % 4) of vector m(i / 4).
 * Reads m0, m1, m2 and overwrites the scratch vectors t0, t1; all of them
 * must be in scope where the macro is expanded.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the expansion is exactly one statement: write LOAD_MSG_7_4(x);
 */
#define LOAD_MSG_7_4(buf) \
do { \
  t0 = _mm_unpacklo_epi32(m0, m1); \
  t1 = _mm_unpackhi_epi32(m1, m2); \
  (buf) = _mm_unpacklo_epi64(t0, t1); \
} while (0)

/*
 * LOAD_MSG_8_1(buf) -- by its name, load 1 of round 8 (confirm against the
 * round code that expands it).
 *
 * Stores into buf the 32-bit lanes { 6, 14, 11, 0 }, lowest lane first,
 * where index i stands for lane (i % 4) of vector m(i / 4).
 * Reads m0, m1, m2, m3 and overwrites the scratch vectors t0, t1, t2; all
 * of them must be in scope where the macro is expanded.
 * The final _mm_shufflehi_epi16 moves 16-bit words in pairs, which swaps
 * the two upper 32-bit lanes and leaves the lower two in place.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the expansion is exactly one statement: write LOAD_MSG_8_1(x);
 */
#define LOAD_MSG_8_1(buf) \
do { \
  t0 = _mm_unpackhi_epi32(m1, m3); \
  t1 = _mm_unpacklo_epi64(t0, m0); \
  t2 = _mm_blend_epi16(t1, m2, 0xC0); \
  (buf) = _mm_shufflehi_epi16(t2, _MM_SHUFFLE(1,0,3,2)); \
} while (0)

/*
 * LOAD_MSG_8_2(buf) -- by its name, load 2 of round 8 (confirm against the
 * round code that expands it).
 *
 * Stores into buf the 32-bit lanes { 15, 9, 3, 8 }, lowest lane first,
 * where index i stands for lane (i % 4) of vector m(i / 4).
 * Reads m0, m2, m3 and overwrites the scratch vectors t0, t1; all of them
 * must be in scope where the macro is expanded.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the expansion is exactly one statement: write LOAD_MSG_8_2(x);
 */
#define LOAD_MSG_8_2(buf) \
do { \
  t0 = _mm_unpackhi_epi32(m0, m3); \
  t1 = _mm_blend_epi16(m2, t0, 0xF0); \
  (buf) = _mm_shuffle_epi32(t1, _MM_SHUFFLE(0,2,1,3)); \
} while (0)

/*
 * LOAD_MSG_8_3(buf) -- by its name, load 3 of round 8 (confirm against the
 * round code that expands it).
 *
 * Stores into buf the 32-bit lanes { 12, 13, 1, 10 }, lowest lane first,
 * where index i stands for lane (i % 4) of vector m(i / 4).
 * Reads m0, m2, m3 and overwrites the scratch vectors t0, t1; all of them
 * must be in scope where the macro is expanded.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the expansion is exactly one statement: write LOAD_MSG_8_3(x);
 */
#define LOAD_MSG_8_3(buf) \
do { \
  t0 = _mm_blend_epi16(m2, m0, 0x0C); \
  t1 = _mm_slli_si128(t0, 4); \
  (buf) = _mm_blend_epi16(t1, m3, 0x0F); \
} while (0)

/*
 * LOAD_MSG_8_4(buf) -- by its name, load 4 of round 8 (confirm against the
 * round code that expands it).
 *
 * Stores into buf the 32-bit lanes { 2, 7, 4, 5 }, lowest lane first,
 * where index i stands for lane (i % 4) of vector m(i / 4).
 * Reads m0, m1 and overwrites the scratch vector t0; all of them must be
 * in scope where the macro is expanded.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the expansion is exactly one statement: write LOAD_MSG_8_4(x);
 */
#define LOAD_MSG_8_4(buf) \
do { \
  t0 = _mm_blend_epi16(m1, m0, 0x30); \
  (buf) = _mm_shuffle_epi32(t0, _MM_SHUFFLE(1,0,3,2)); \
} while (0)

/*
 * LOAD_MSG_9_1(buf) -- by its name, load 1 of round 9 (confirm against the
 * round code that expands it).
 *
 * Stores into buf the 32-bit lanes { 10, 8, 7, 1 }, lowest lane first,
 * where index i stands for lane (i % 4) of vector m(i / 4).
 * Reads m0, m1, m2 and overwrites the scratch vectors t0, t1, t2; all of
 * them must be in scope where the macro is expanded.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the expansion is exactly one statement: write LOAD_MSG_9_1(x);
 */
#define LOAD_MSG_9_1(buf) \
do { \
  t0 = _mm_blend_epi16(m0, m2, 0x03); \
  t1 = _mm_blend_epi16(m1, m2, 0x30); \
  t2 = _mm_blend_epi16(t1, t0, 0x0F); \
  (buf) = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,3,0,2)); \
} while (0)

/*
 * LOAD_MSG_9_2(buf) -- by its name, load 2 of round 9 (confirm against the
 * round code that expands it).
 *
 * Stores into buf the 32-bit lanes { 2, 4, 6, 5 }, lowest lane first,
 * where index i stands for lane (i % 4) of vector m(i / 4).
 * Reads m0, m1 and overwrites the scratch vectors t0, t1; all of them must
 * be in scope where the macro is expanded.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the expansion is exactly one statement: write LOAD_MSG_9_2(x);
 */
#define LOAD_MSG_9_2(buf) \
do { \
  t0 = _mm_slli_si128(m0, 4); \
  t1 = _mm_blend_epi16(m1, t0, 0xC0); \
  (buf) = _mm_shuffle_epi32(t1, _MM_SHUFFLE(1,2,0,3)); \
} while (0)

/*
 * LOAD_MSG_9_3(buf) -- by its name, load 3 of round 9 (confirm against the
 * round code that expands it).
 *
 * Stores into buf the 32-bit lanes { 15, 9, 3, 13 }, lowest lane first,
 * where index i stands for lane (i % 4) of vector m(i / 4).
 * Reads m0, m2, m3 and overwrites the scratch vectors t0, t1, t2; all of
 * them must be in scope where the macro is expanded.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the expansion is exactly one statement: write LOAD_MSG_9_3(x);
 */
#define LOAD_MSG_9_3(buf) \
do { \
  t0 = _mm_unpackhi_epi32(m0, m3); \
  t1 = _mm_unpacklo_epi32(m2, m3); \
  t2 = _mm_unpackhi_epi64(t0, t1); \
  (buf) = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,0,2,1)); \
} while (0)

/*
 * LOAD_MSG_9_4(buf) -- by its name, load 4 of round 9 (confirm against the
 * round code that expands it).
 *
 * Stores into buf the 32-bit lanes { 11, 14, 12, 0 }, lowest lane first,
 * where index i stands for lane (i % 4) of vector m(i / 4).
 * Reads m0, m2, m3 and overwrites the scratch vectors t0, t1, t2; all of
 * them must be in scope where the macro is expanded.
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the expansion is exactly one statement: write LOAD_MSG_9_4(x);
 */
#define LOAD_MSG_9_4(buf) \
do { \
  t0 = _mm_blend_epi16(m3, m2, 0xC0); \
  t1 = _mm_unpacklo_epi32(m0, m3); \
  t2 = _mm_blend_epi16(t0, t1, 0x0F); \
  (buf) = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0,1,2,3)); \
} while (0)

#endif