/* FreeRDP: A Remote Desktop Protocol Client
 * Optimized sign operations.
 * vi:ts=4 sw=4:
 *
 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License. You may obtain
 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>

#ifdef WITH_SSE2
#include <emmintrin.h> /* SSE2 intrinsics */
#include <tmmintrin.h> /* SSSE3 intrinsics (_mm_sign_epi16) */
#endif /* WITH_SSE2 */

#include "prim_internal.h"

static primitives_t* generic = NULL;

#ifdef WITH_SSE2
/* ------------------------------------------------------------------------- */
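/* Computes the per-element sign of a 16-bit integer array:
 * dst[i] = -1, 0, or 1 as src[i] is negative, zero, or positive.
 * The SSSE3 PSIGNW instruction (_mm_sign_epi16(a, b)) negates each lane
 * of a where the corresponding lane of b is negative, zeroes it where b
 * is zero, and passes it through where b is positive; applied to a
 * register of ones it therefore yields sign(b) directly.
 */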
static pstatus_t ssse3_sign_16s(const INT16* pSrc, INT16* pDst, UINT32 len)
{
	const INT16* sptr = pSrc;
	INT16* dptr = pDst;
	size_t count;

	/* Too short to be worth vectorizing; use the generic version. */
	if (len < 16)
	{
		return generic->sign_16s(pSrc, pDst, len);
	}

	/* An odd destination address can never reach 16-byte alignment by
	 * stepping one INT16 at a time, so fall back to the generic version
	 * rather than loop forever below. */
	if ((ULONG_PTR)pDst & 0x01)
	{
		return generic->sign_16s(pSrc, pDst, len);
	}

	/* Advance one INT16 at a time until the destination is 16-byte
	 * aligned, so that the vector stores below can use aligned access. */
	while ((ULONG_PTR)dptr & 0x0f)
	{
		INT16 src = *sptr++;
		*dptr++ = (src < 0) ? (-1) : ((src > 0) ? 1 : 0);

		if (--len == 0)
			return PRIMITIVES_SUCCESS;
	}

	/* Do 32-short chunks using 8 XMM registers. */
	count = len >> 5;  /* / 32 */
	len -= count << 5; /* * 32 */

	if ((ULONG_PTR)sptr & 0x0f)
	{
		/* Unaligned source: use lddqu loads. */
		while (count--)
		{
			__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
			xmm0 = _mm_set1_epi16(0x0001U);
			xmm1 = _mm_set1_epi16(0x0001U);
			xmm2 = _mm_set1_epi16(0x0001U);
			xmm3 = _mm_set1_epi16(0x0001U);
			xmm4 = _mm_lddqu_si128((const __m128i*)sptr);
			sptr += 8;
			xmm5 = _mm_lddqu_si128((const __m128i*)sptr);
			sptr += 8;
			xmm6 = _mm_lddqu_si128((const __m128i*)sptr);
			sptr += 8;
			xmm7 = _mm_lddqu_si128((const __m128i*)sptr);
			sptr += 8;
			xmm0 = _mm_sign_epi16(xmm0, xmm4);
			xmm1 = _mm_sign_epi16(xmm1, xmm5);
			xmm2 = _mm_sign_epi16(xmm2, xmm6);
			xmm3 = _mm_sign_epi16(xmm3, xmm7);
			_mm_store_si128((__m128i*)dptr, xmm0);
			dptr += 8;
			_mm_store_si128((__m128i*)dptr, xmm1);
			dptr += 8;
			_mm_store_si128((__m128i*)dptr, xmm2);
			dptr += 8;
			_mm_store_si128((__m128i*)dptr, xmm3);
			dptr += 8;
		}
	}
	else
	{
		/* Aligned source: use movdqa loads. */
		while (count--)
		{
			__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
			xmm0 = _mm_set1_epi16(0x0001U);
			xmm1 = _mm_set1_epi16(0x0001U);
			xmm2 = _mm_set1_epi16(0x0001U);
			xmm3 = _mm_set1_epi16(0x0001U);
			xmm4 = _mm_load_si128((const __m128i*)sptr);
			sptr += 8;
			xmm5 = _mm_load_si128((const __m128i*)sptr);
			sptr += 8;
			xmm6 = _mm_load_si128((const __m128i*)sptr);
			sptr += 8;
			xmm7 = _mm_load_si128((const __m128i*)sptr);
			sptr += 8;
			xmm0 = _mm_sign_epi16(xmm0, xmm4);
			xmm1 = _mm_sign_epi16(xmm1, xmm5);
			xmm2 = _mm_sign_epi16(xmm2, xmm6);
			xmm3 = _mm_sign_epi16(xmm3, xmm7);
			_mm_store_si128((__m128i*)dptr, xmm0);
			dptr += 8;
			_mm_store_si128((__m128i*)dptr, xmm1);
			dptr += 8;
			_mm_store_si128((__m128i*)dptr, xmm2);
			dptr += 8;
			_mm_store_si128((__m128i*)dptr, xmm3);
			dptr += 8;
		}
	}

	/* Do 8-short chunks using two XMM registers. */
	count = len >> 3;  /* / 8 */
	len -= count << 3; /* * 8 */

	while (count--)
	{
		__m128i xmm0 = _mm_set1_epi16(0x0001U);
		__m128i xmm1 = LOAD_SI128(sptr);
		sptr += 8;
		xmm0 = _mm_sign_epi16(xmm0, xmm1);
		_mm_store_si128((__m128i*)dptr, xmm0);
		dptr += 8;
	}

	/* Do leftovers. */
	while (len--)
	{
		INT16 src = *sptr++;
		*dptr++ = (src < 0) ? -1 : ((src > 0) ? 1 : 0);
	}

	return PRIMITIVES_SUCCESS;
}
#endif /* WITH_SSE2 */

/* ------------------------------------------------------------------------- */
void primitives_init_sign_opt(primitives_t* prims)
{
	generic = primitives_get_generic();
	primitives_init_sign(prims);
	/* Pick tuned versions if possible. */
	/* I didn't spot an IPP version of this. */
#if defined(WITH_SSE2)

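	/* _mm_sign_epi16 (PSIGNW) is an SSSE3 instruction, so only install the
	 * optimized routine when the CPU reports SSSE3 support. */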
	if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) &&
	    IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
	{
		prims->sign_16s = ssse3_sign_16s;
	}

#endif
}
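
/* A minimal usage sketch (an assumption of the typical FreeRDP primitives
 * flow, not shown in this file): callers obtain the dispatch table and call
 * through it, e.g.
 *
 *   const primitives_t* prims = primitives_get();
 *   pstatus_t status = prims->sign_16s(src, dst, len);
 *
 * Once primitives_init_sign_opt() has run on an SSSE3-capable CPU,
 * prims->sign_16s points at ssse3_sign_16s().
 */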