/* FreeRDP: A Remote Desktop Protocol Client
 * Optimized sign operations.
 * vi:ts=4 sw=4:
 *
 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License. You may obtain
 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */
15
16 #ifdef HAVE_CONFIG_H
17 #include "config.h"
18 #endif
19
20 #include <freerdp/types.h>
21 #include <freerdp/primitives.h>
22 #include <winpr/sysinfo.h>
23
24 #ifdef WITH_SSE2
25 #include <emmintrin.h>
26 #include <tmmintrin.h>
27 #endif /* WITH_SSE2 */
28
29 #include "prim_internal.h"
30
31 static primitives_t* generic = NULL;
32
33 #ifdef WITH_SSE2
34 /* ------------------------------------------------------------------------- */
ssse3_sign_16s(const INT16 * pSrc,INT16 * pDst,UINT32 len)35 static pstatus_t ssse3_sign_16s(const INT16* pSrc, INT16* pDst, UINT32 len)
36 {
37 const INT16* sptr = (const INT16*)pSrc;
38 INT16* dptr = (INT16*)pDst;
39 size_t count;
40
41 if (len < 16)
42 {
43 return generic->sign_16s(pSrc, pDst, len);
44 }
45
46 /* Check for 16-byte alignment (eventually). */
47 if ((ULONG_PTR)pDst & 0x01)
48 {
49 return generic->sign_16s(pSrc, pDst, len);
50 }
51
52 /* Seek 16-byte alignment. */
53 while ((ULONG_PTR)dptr & 0x0f)
54 {
55 INT16 src = *sptr++;
56 *dptr++ = (src < 0) ? (-1) : ((src > 0) ? 1 : 0);
57
58 if (--len == 0)
59 return PRIMITIVES_SUCCESS;
60 }
61
62 /* Do 32-short chunks using 8 XMM registers. */
63 count = len >> 5; /* / 32 */
64 len -= count << 5; /* * 32 */
65
66 if ((ULONG_PTR)sptr & 0x0f)
67 {
68 /* Unaligned */
69 while (count--)
70 {
71 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
72 xmm0 = _mm_set1_epi16(0x0001U);
73 xmm1 = _mm_set1_epi16(0x0001U);
74 xmm2 = _mm_set1_epi16(0x0001U);
75 xmm3 = _mm_set1_epi16(0x0001U);
76 xmm4 = _mm_lddqu_si128((__m128i*)sptr);
77 sptr += 8;
78 xmm5 = _mm_lddqu_si128((__m128i*)sptr);
79 sptr += 8;
80 xmm6 = _mm_lddqu_si128((__m128i*)sptr);
81 sptr += 8;
82 xmm7 = _mm_lddqu_si128((__m128i*)sptr);
83 sptr += 8;
84 xmm0 = _mm_sign_epi16(xmm0, xmm4);
85 xmm1 = _mm_sign_epi16(xmm1, xmm5);
86 xmm2 = _mm_sign_epi16(xmm2, xmm6);
87 xmm3 = _mm_sign_epi16(xmm3, xmm7);
88 _mm_store_si128((__m128i*)dptr, xmm0);
89 dptr += 8;
90 _mm_store_si128((__m128i*)dptr, xmm1);
91 dptr += 8;
92 _mm_store_si128((__m128i*)dptr, xmm2);
93 dptr += 8;
94 _mm_store_si128((__m128i*)dptr, xmm3);
95 dptr += 8;
96 }
97 }
98 else
99 {
100 /* Aligned */
101 while (count--)
102 {
103 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
104 xmm0 = _mm_set1_epi16(0x0001U);
105 xmm1 = _mm_set1_epi16(0x0001U);
106 xmm2 = _mm_set1_epi16(0x0001U);
107 xmm3 = _mm_set1_epi16(0x0001U);
108 xmm4 = _mm_load_si128((__m128i*)sptr);
109 sptr += 8;
110 xmm5 = _mm_load_si128((__m128i*)sptr);
111 sptr += 8;
112 xmm6 = _mm_load_si128((__m128i*)sptr);
113 sptr += 8;
114 xmm7 = _mm_load_si128((__m128i*)sptr);
115 sptr += 8;
116 xmm0 = _mm_sign_epi16(xmm0, xmm4);
117 xmm1 = _mm_sign_epi16(xmm1, xmm5);
118 xmm2 = _mm_sign_epi16(xmm2, xmm6);
119 xmm3 = _mm_sign_epi16(xmm3, xmm7);
120 _mm_store_si128((__m128i*)dptr, xmm0);
121 dptr += 8;
122 _mm_store_si128((__m128i*)dptr, xmm1);
123 dptr += 8;
124 _mm_store_si128((__m128i*)dptr, xmm2);
125 dptr += 8;
126 _mm_store_si128((__m128i*)dptr, xmm3);
127 dptr += 8;
128 }
129 }
130
131 /* Do 8-short chunks using two XMM registers. */
132 count = len >> 3;
133 len -= count << 3;
134
135 while (count--)
136 {
137 __m128i xmm0 = _mm_set1_epi16(0x0001U);
138 __m128i xmm1 = LOAD_SI128(sptr);
139 sptr += 8;
140 xmm0 = _mm_sign_epi16(xmm0, xmm1);
141 _mm_store_si128((__m128i*)dptr, xmm0);
142 dptr += 8;
143 }
144
145 /* Do leftovers. */
146 while (len--)
147 {
148 INT16 src = *sptr++;
149 *dptr++ = (src < 0) ? -1 : ((src > 0) ? 1 : 0);
150 }
151
152 return PRIMITIVES_SUCCESS;
153 }
154 #endif /* WITH_SSE2 */
155
156 /* ------------------------------------------------------------------------- */
primitives_init_sign_opt(primitives_t * prims)157 void primitives_init_sign_opt(primitives_t* prims)
158 {
159 generic = primitives_get_generic();
160 primitives_init_sign(prims);
161 /* Pick tuned versions if possible. */
162 /* I didn't spot an IPP version of this. */
163 #if defined(WITH_SSE2)
164
165 if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) &&
166 IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
167 {
168 prims->sign_16s = ssse3_sign_16s;
169 }
170
171 #endif
172 }
173