1 /*  This file is part of the Vc library. {{{
2 Copyright © 2014-2015 Matthias Kretz <kretz@kde.org>
3 
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are met:
6     * Redistributions of source code must retain the above copyright
7       notice, this list of conditions and the following disclaimer.
8     * Redistributions in binary form must reproduce the above copyright
9       notice, this list of conditions and the following disclaimer in the
10       documentation and/or other materials provided with the distribution.
11     * Neither the names of contributing organizations nor the
12       names of its contributors may be used to endorse or promote products
13       derived from this software without specific prior written permission.
14 
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
19 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 
26 }}}*/
27 
28 #ifndef VC_COMMON_SCATTERIMPLEMENTATION_H_
29 #define VC_COMMON_SCATTERIMPLEMENTATION_H_
30 
31 #include "gatherimplementation.h"
32 #include "macros.h"
33 
34 namespace Vc_VERSIONED_NAMESPACE
35 {
36 namespace Common
37 {
38 
39 template <typename V, typename MT, typename IT>
executeScatter(SetIndexZeroT,V & v,MT * mem,IT indexes,typename V::MaskArgument mask)40 Vc_ALWAYS_INLINE void executeScatter(SetIndexZeroT,
41                                     V &v,
42                                     MT *mem,
43                                     IT indexes,
44                                     typename V::MaskArgument mask)
45 {
46     indexes.setZeroInverted(static_cast<typename IT::Mask>(mask));
47     // Huh?
48     const V tmp(mem, indexes);
49     where(mask) | v = tmp;
50 }
51 
52 template <typename V, typename MT, typename IT>
executeScatter(SimpleLoopT,V & v,MT * mem,const IT & indexes,typename V::MaskArgument mask)53 Vc_ALWAYS_INLINE void executeScatter(SimpleLoopT,
54                                     V &v,
55                                     MT *mem,
56                                     const IT &indexes,
57                                     typename V::MaskArgument mask)
58 {
59     if (Vc_IS_UNLIKELY(mask.isEmpty())) {
60         return;
61     }
62     Common::unrolled_loop<std::size_t, 0, V::Size>([&](std::size_t i) {
63         if (mask[i])
64             mem[indexes[i]] = v[i];
65     });
66 }
67 
68 template <typename V, typename MT, typename IT>
executeScatter(BitScanLoopT,V & v,MT * mem,const IT & indexes,typename V::MaskArgument mask)69 Vc_ALWAYS_INLINE void executeScatter(BitScanLoopT,
70                                     V &v,
71                                     MT *mem,
72                                     const IT &indexes,
73                                     typename V::MaskArgument mask)
74 {
75     size_t bits = mask.toInt();
76     while (Vc_IS_LIKELY(bits > 0)) {
77         size_t i, j;
78         asm("bsf %[bits],%[i]\n\t"
79             "bsr %[bits],%[j]\n\t"
80             "btr %[i],%[bits]\n\t"
81             "btr %[j],%[bits]\n\t"
82             : [i] "=r"(i), [j] "=r"(j), [bits] "+r"(bits));
83         mem[indexes[i]] = v[i];
84         mem[indexes[j]] = v[j];
85     }
86 
87     /* Alternative from Vc::SSE (0.7)
88     int bits = mask.toInt();
89     while (bits) {
90         const int i = _bit_scan_forward(bits);
91         bits ^= (1 << i); // btr?
92         mem[indexes[i]] = v[i];
93     }
94     */
95 }
96 
97 template <typename V, typename MT, typename IT>
98 Vc_ALWAYS_INLINE void executeScatter(PopcntSwitchT,
99                                     V &v,
100                                     MT *mem,
101                                     const IT &indexes,
102                                     typename V::MaskArgument mask,
103                                     enable_if<V::Size == 16> = nullarg)
104 {
105     unsigned int bits = mask.toInt();
106     unsigned int low, high = 0;
107     switch (Vc::Detail::popcnt16(bits)) {
108     case 16:
109         v.scatter(mem, indexes);
110         break;
111     case 15:
112         low = _bit_scan_forward(bits);
113         bits ^= 1 << low;
114         mem[indexes[low]] = v[low];
115     case 14:
116         high = _bit_scan_reverse(bits);
117         mem[indexes[high]] = v[high];
118         high = (1 << high);
119     case 13:
120         low = _bit_scan_forward(bits);
121         bits ^= high | (1 << low);
122         mem[indexes[low]] = v[low];
123     case 12:
124         high = _bit_scan_reverse(bits);
125         mem[indexes[high]] = v[high];
126         high = (1 << high);
127     case 11:
128         low = _bit_scan_forward(bits);
129         bits ^= high | (1 << low);
130         mem[indexes[low]] = v[low];
131     case 10:
132         high = _bit_scan_reverse(bits);
133         mem[indexes[high]] = v[high];
134         high = (1 << high);
135     case 9:
136         low = _bit_scan_forward(bits);
137         bits ^= high | (1 << low);
138         mem[indexes[low]] = v[low];
139     case 8:
140         high = _bit_scan_reverse(bits);
141         mem[indexes[high]] = v[high];
142         high = (1 << high);
143     case 7:
144         low = _bit_scan_forward(bits);
145         bits ^= high | (1 << low);
146         mem[indexes[low]] = v[low];
147     case 6:
148         high = _bit_scan_reverse(bits);
149         mem[indexes[high]] = v[high];
150         high = (1 << high);
151     case 5:
152         low = _bit_scan_forward(bits);
153         bits ^= high | (1 << low);
154         mem[indexes[low]] = v[low];
155     case 4:
156         high = _bit_scan_reverse(bits);
157         mem[indexes[high]] = v[high];
158         high = (1 << high);
159     case 3:
160         low = _bit_scan_forward(bits);
161         bits ^= high | (1 << low);
162         mem[indexes[low]] = v[low];
163     case 2:
164         high = _bit_scan_reverse(bits);
165         mem[indexes[high]] = v[high];
166     case 1:
167         low = _bit_scan_forward(bits);
168         mem[indexes[low]] = v[low];
169     case 0:
170         break;
171     }
172 }
173 template <typename V, typename MT, typename IT>
174 Vc_ALWAYS_INLINE void executeScatter(PopcntSwitchT,
175                                     V &v,
176                                     MT *mem,
177                                     const IT &indexes,
178                                     typename V::MaskArgument mask,
179                                     enable_if<V::Size == 8> = nullarg)
180 {
181     unsigned int bits = mask.toInt();
182     unsigned int low, high = 0;
183     switch (Vc::Detail::popcnt8(bits)) {
184     case 8:
185         v.scatter(mem, indexes);
186         break;
187     case 7:
188         low = _bit_scan_forward(bits);
189         bits ^= 1 << low;
190         mem[indexes[low]] = v[low];
191     case 6:
192         high = _bit_scan_reverse(bits);
193         mem[indexes[high]] = v[high];
194         high = (1 << high);
195     case 5:
196         low = _bit_scan_forward(bits);
197         bits ^= high | (1 << low);
198         mem[indexes[low]] = v[low];
199     case 4:
200         high = _bit_scan_reverse(bits);
201         mem[indexes[high]] = v[high];
202         high = (1 << high);
203     case 3:
204         low = _bit_scan_forward(bits);
205         bits ^= high | (1 << low);
206         mem[indexes[low]] = v[low];
207     case 2:
208         high = _bit_scan_reverse(bits);
209         mem[indexes[high]] = v[high];
210     case 1:
211         low = _bit_scan_forward(bits);
212         mem[indexes[low]] = v[low];
213     case 0:
214         break;
215     }
216 }
217 template <typename V, typename MT, typename IT>
218 Vc_ALWAYS_INLINE void executeScatter(PopcntSwitchT,
219                                     V &v,
220                                     MT *mem,
221                                     const IT &indexes,
222                                     typename V::MaskArgument mask,
223                                     enable_if<V::Size == 4> = nullarg)
224 {
225     unsigned int bits = mask.toInt();
226     unsigned int low, high = 0;
227     switch (Vc::Detail::popcnt4(bits)) {
228     case 4:
229         v.scatter(mem, indexes);
230         break;
231     case 3:
232         low = _bit_scan_forward(bits);
233         bits ^= 1 << low;
234         mem[indexes[low]] = v[low];
235     case 2:
236         high = _bit_scan_reverse(bits);
237         mem[indexes[high]] = v[high];
238     case 1:
239         low = _bit_scan_forward(bits);
240         mem[indexes[low]] = v[low];
241     case 0:
242         break;
243     }
244 }
245 template <typename V, typename MT, typename IT>
246 Vc_ALWAYS_INLINE void executeScatter(PopcntSwitchT,
247                                     V &v,
248                                     MT *mem,
249                                     const IT &indexes,
250                                     typename V::MaskArgument mask,
251                                     enable_if<V::Size == 2> = nullarg)
252 {
253     unsigned int bits = mask.toInt();
254     unsigned int low;
255     switch (Vc::Detail::popcnt4(bits)) {
256     case 2:
257         v.scatter(mem, indexes);
258         break;
259     case 1:
260         low = _bit_scan_forward(bits);
261         mem[indexes[low]] = v[low];
262     case 0:
263         break;
264     }
265 }
266 
267 }  // namespace Common
268 }  // namespace Vc
269 
270 #endif // VC_COMMON_SCATTERIMPLEMENTATION_H_
271