1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20 /*!
21 * \file packet-inl.h
22 * \brief Generic packet vectorization code
23 */
24 #ifndef MSHADOW_PACKET_INL_H_
25 #define MSHADOW_PACKET_INL_H_
26
27 #if defined(__APPLE__) || defined(__FreeBSD__) || defined(__DragonFly__)
28 #include <stdlib.h>
29 #else
30 #include <malloc.h>
31 #endif
32 #include "./base.h"
33 #include "./tensor.h"
34 #include "./expression.h"
35
36
37 namespace mshadow {
38 /*! \brief namespace of packet math*/
39 namespace packet {
40
/*!
 * \brief packet architectures a Packet can be instantiated for.
 *  MSHADOW_DEFAULT_PACKET selects kSSE2 when MSHADOW_USE_SSE is set
 *  and kPlain otherwise.
 */
enum PacketArch {
  kPlain = 0,
  kSSE2 = 1
};
45
// Default packet architecture: kSSE2 when MSHADOW_USE_SSE is non-zero,
// kPlain when it is zero or not defined.
#if !MSHADOW_USE_SSE
#define MSHADOW_DEFAULT_PACKET ::mshadow::packet::kPlain
#else
#define MSHADOW_DEFAULT_PACKET ::mshadow::packet::kSSE2
#endif
51
52 // whether packet operator is enabled.
53 /*!
54 * \brief Generic packet type
55 * \tparam DType The data type of the packet.
56 * \tparam Arch the Arch of the packet.
57 */
58 template<typename DType, PacketArch Arch = MSHADOW_DEFAULT_PACKET>
59 struct Packet;
60
61 template<PacketArch Arch>
62 struct AlignBytes {
63 static const index_t value = 4;
64 };
65
66 } // namespace packet
67 } // namespace mshadow
68
69 namespace mshadow {
70 namespace packet {
71 /*!
72 * \brief analog to cudaMallocPitch, allocate a aligned space with num_line * lspace cells
73 * \param out_pitch output parameter, the actuall space allocated for each line
74 * \param lspace number of cells required for each line
75 * \param num_line number of lines to be allocated
76 */
AlignedMallocPitch(size_t * out_pitch,size_t lspace,size_t num_line)77 inline void* AlignedMallocPitch(size_t *out_pitch,
78 size_t lspace,
79 size_t num_line) {
80 const index_t bits = AlignBytes<MSHADOW_DEFAULT_PACKET>::value;
81 const index_t mask = (1 << bits) - 1;
82
83 size_t pitch = ((lspace + mask) >> bits) << bits;
84 *out_pitch = pitch;
85 #ifdef _MSC_VER
86 void *res = _aligned_malloc(pitch * num_line, 1 << bits);
87 #else
88 void *res;
89 int ret = posix_memalign(&res, 1 << bits, pitch * num_line);
90 CHECK_EQ(ret, 0) << "AlignedMallocPitch failed";
91 #endif
92 if (res == NULL) {
93 LOG(FATAL) << "AlignedMallocPitch failed";
94 }
95 #if __GNUC__ >= 6
96 #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
97 #endif
98 return res;
99 #pragma GCC diagnostic pop
100 }
101
/*!
 * \brief free aligned space, using the deallocator that matches the
 *  platform branch taken in AlignedMallocPitch
 * \param ptr pointer to space to be freed
 */
inline void AlignedFree(void *ptr) {
#ifndef _MSC_VER
  free(ptr);
#else
  _aligned_free(ptr);
#endif
}
113
114 /*! \brief check if a pointer is aligned */
115 template<PacketArch Arch>
CheckAlign(size_t pitch)116 inline bool CheckAlign(size_t pitch) {
117 const index_t bits = AlignBytes<Arch>::value;
118 return !(pitch & ((1 << bits) - 1));
119 }
120
121 /*! \brief check if a pointer is aligned */
122 template<PacketArch Arch>
CheckAlign(void * ptr)123 inline bool CheckAlign(void *ptr) {
124 return CheckAlign<Arch>(reinterpret_cast<size_t>(ptr));
125 }
126
127 /*!
128 * \brief get upper bound of aligned index of size
129 * \param size size of the array
130 * \param fsize size of float
131 */
132 template<typename DType, PacketArch Arch>
UpperAlign(index_t size)133 inline index_t UpperAlign(index_t size) {
134 const index_t bits = AlignBytes<MSHADOW_DEFAULT_PACKET>::value;
135 const index_t mask = (1 << bits) - 1;
136 const index_t fsize = sizeof(DType);
137 return (((size * fsize + mask) >> bits) << bits) / fsize;
138 }
139
140 /*!
141 * \brief get lower bound of aligned index of size
142 * \param size size of the array
143 * \param fsize size of float
144 */
145 template<typename DType, PacketArch Arch>
LowerAlign(index_t size)146 inline index_t LowerAlign(index_t size) {
147 const index_t bits = AlignBytes<MSHADOW_DEFAULT_PACKET>::value;
148 const index_t fsize = sizeof(DType);
149 return (((size * fsize) >> bits) << bits) / fsize;
150 }
151
152 /*!
153 * \brief generic Packet operator
154 * \tparam OP The operator
155 * \tparam DType The data type
156 * \tparam Arch The architecture.
157 */
158 template<typename OP, typename DType, PacketArch Arch>
159 struct PacketOp {
160 static const bool kEnabled = false;
161 };
162 // specialization of operators
163 template<typename DType, PacketArch Arch>
164 struct PacketOp<op::plus, DType, Arch> {
165 static const bool kEnabled = true;
166 MSHADOW_CINLINE static Packet<DType, Arch> Map(const Packet<DType, Arch>& lhs,
167 const Packet<DType, Arch>& rhs) {
168 return lhs + rhs;
169 }
170 };
171 template<typename DType, PacketArch Arch>
172 struct PacketOp<op::minus, DType, Arch> {
173 static const bool kEnabled = true;
174 MSHADOW_CINLINE static Packet<DType, Arch> Map(const Packet<DType, Arch>& lhs,
175 const Packet<DType, Arch>& rhs) {
176 return lhs - rhs;
177 }
178 };
179 template<typename DType, PacketArch Arch>
180 struct PacketOp<op::mul, DType, Arch> {
181 static const bool kEnabled = true;
182 MSHADOW_CINLINE static Packet<DType, Arch> Map(const Packet<DType, Arch>& lhs,
183 const Packet<DType, Arch>& rhs) {
184 return lhs * rhs;
185 }
186 };
187 template<typename DType, PacketArch Arch>
188 struct PacketOp<op::div, DType, Arch> {
189 static const bool kEnabled = true;
190 MSHADOW_CINLINE static Packet<DType, Arch> Map(const Packet<DType, Arch>& lhs,
191 const Packet<DType, Arch>& rhs) {
192 return lhs / rhs;
193 }
194 };
195
196 template<typename DType, PacketArch Arch>
197 struct PacketOp<op::identity, DType, Arch> {
198 static const bool kEnabled = true;
199 MSHADOW_CINLINE static Packet<DType, Arch> Map(const Packet<DType, Arch>& src) {
200 return src;
201 }
202 };
203
204
205 // savers to do storage
206 template<typename SV, typename TFloat, PacketArch Arch>
207 struct Saver{
208 MSHADOW_CINLINE static void Save(TFloat *dst, const Packet<TFloat, Arch>& src) {
209 Packet<TFloat, Arch> lhs = Packet<TFloat, Arch>::Load(dst);
210 Packet<TFloat, Arch> ans = PacketOp<typename SV::OPType, TFloat, Arch>::Map(lhs, src);
211 ans.Store(dst);
212 }
213 };
214 template<typename TFloat, PacketArch Arch>
215 struct Saver<sv::saveto, TFloat, Arch> {
216 MSHADOW_CINLINE static void Save(TFloat *dst, const Packet<TFloat, Arch>& src) {
217 src.Store(dst);
218 }
219 };
220 } // namespace packet
221 } // namespace mshadow
222
223 #include "packet/plain-inl.h"
224 #if MSHADOW_USE_SSE && !defined(__CUDACC__)
225 #include "packet/sse-inl.h"
226 #endif
227
228 namespace mshadow {
229 namespace expr {
230
231 typedef packet::PacketArch PacketArch;
232
233 // same as plan, but use packet
234 template<typename ExpType, typename DType, PacketArch Arch>
235 class PacketPlan {
236 public:
237 /*!
238 * \brief evaluate the expression at index [y][x],
239 * x will be aligned to Packet<DType, Arch>::Size()
240 */
241 MSHADOW_CINLINE packet::Packet<DType, Arch> EvalPacket(index_t y, index_t x) const;
242 MSHADOW_CINLINE DType Eval(index_t y, index_t x) const;
243 };
244
245 template <typename Device, int dim, typename DType, PacketArch Arch>
246 class PacketPlan<Tensor<Device, dim, DType>, DType, Arch> {
247 public:
248 explicit PacketPlan(const Tensor<Device, dim, DType> &t)
249 :dptr_(t.dptr_), stride_(t.stride_) {}
250 MSHADOW_CINLINE packet::Packet<DType, Arch> EvalPacket(index_t y, index_t x) const {
251 return packet::Packet<DType, Arch>::Load(&dptr_[y * stride_ + x]);
252 }
253 MSHADOW_CINLINE DType Eval(index_t y, index_t x) const {
254 return dptr_[y * stride_ + x];
255 }
256
257 private:
258 const DType *dptr_;
259 index_t stride_;
260 };
261
262 template<typename DType, PacketArch Arch>
263 class PacketPlan<ScalarExp<DType>, DType, Arch> {
264 public:
265 explicit PacketPlan(DType scalar) : scalar_(scalar) {}
266 MSHADOW_CINLINE packet::Packet<DType, Arch> EvalPacket(index_t y, index_t x) const {
267 return packet::Packet<DType, Arch>::Fill(scalar_);
268 }
269 MSHADOW_CINLINE DType Eval(index_t y, index_t x) const {
270 return scalar_;
271 }
272
273 private:
274 DType scalar_;
275 };
276
277 template<typename OP, typename TA, typename TB, int etype, typename DType, PacketArch Arch>
278 class PacketPlan<BinaryMapExp<OP, TA, TB, DType, etype>, DType, Arch> {
279 public:
280 PacketPlan(const PacketPlan<TA, DType, Arch> &lhs, const PacketPlan<TB, DType, Arch> &rhs)
281 : lhs_(lhs), rhs_(rhs) {}
282 MSHADOW_CINLINE packet::Packet<DType, Arch> EvalPacket(index_t y, index_t x) const {
283 return packet::PacketOp<OP, DType, Arch>::Map(lhs_.EvalPacket(y, x), rhs_.EvalPacket(y, x));
284 }
285 MSHADOW_CINLINE DType Eval(index_t y, index_t x) const {
286 return OP::Map(lhs_.Eval(y, x), rhs_.Eval(y, x));
287 }
288
289 private:
290 PacketPlan<TA, DType, Arch> lhs_;
291 PacketPlan<TB, DType, Arch> rhs_;
292 };
293
294 template<typename OP, typename TA, int etype, typename DType, PacketArch Arch>
295 class PacketPlan<UnaryMapExp<OP, TA, DType, etype>, DType, Arch> {
296 public:
297 PacketPlan(const PacketPlan<TA, DType, Arch> &src) : src_(src) {}
298 MSHADOW_CINLINE packet::Packet<DType> EvalPacket(index_t y, index_t x) const {
299 return packet::PacketOp<OP, DType, Arch>::Map(src_.EvalPacket(y, x));
300 }
301 MSHADOW_CINLINE DType Eval(index_t y, index_t x) const {
302 return OP::Map(src_.Eval(y, x));
303 }
304
305 private:
306 PacketPlan<TA, DType, Arch> src_;
307 };
308
309 template<PacketArch Arch, typename OP, typename TA, typename TB, typename DType, int etype>
310 inline PacketPlan<BinaryMapExp<OP, TA, TB, DType, etype>, DType, Arch>
311 MakePacketPlan(const BinaryMapExp<OP, TA, TB, DType, etype> &e);
312
313 template<PacketArch Arch, typename DType>
314 inline PacketPlan<ScalarExp<DType>, DType, Arch> MakePacketPlan(const ScalarExp<DType> &e) {
315 return PacketPlan<ScalarExp<DType>, DType, Arch>(e.scalar_);
316 }
317 template<PacketArch Arch, typename T, typename DType>
318 inline PacketPlan<T, DType, Arch> MakePacketPlan(const RValueExp<T, DType> &e) {
319 return PacketPlan<T, DType, Arch>(e.self());
320 }
321 template<PacketArch Arch, typename T, int dim, typename DType>
322 inline PacketPlan<T, DType, Arch>
323 MakePacketPlan(const MakeTensorExp<T, cpu, dim, DType> &e) {
324 return PacketPlan<T, DType, Arch>(e.real_self());
325 }
326 template<PacketArch Arch, typename OP, typename TA, typename DType, int etype>
327 inline PacketPlan<UnaryMapExp<OP, TA, DType, etype>, DType, Arch>
328 MakePacketPlan(const UnaryMapExp<OP, TA, DType, etype> &e) {
329 return PacketPlan<UnaryMapExp<OP, TA, DType, etype>, DType, Arch>(MakePacketPlan<Arch>(e.src_));
330 }
331 template<PacketArch Arch, typename OP, typename TA, typename TB, typename DType, int etype>
332 inline PacketPlan<BinaryMapExp<OP, TA, TB, DType, etype>, DType, Arch>
333 MakePacketPlan(const BinaryMapExp<OP, TA, TB, DType, etype> &e) {
334 return PacketPlan<BinaryMapExp<OP, TA, TB, DType, etype>,
335 DType, Arch>(MakePacketPlan<Arch>(e.lhs_), MakePacketPlan<Arch>(e.rhs_));
336 }
337
338 /*!
339 * \brief static check packet enable
340 *
341 * \tparam Device the type of Device
342 * \tparam dim dimension of the tensor
343 * \tparam E expression
344 */
345 template<typename E, PacketArch Arch>
346 struct PacketCheck{
347 static const bool kPass = false;
348 };
349 template<PacketArch Arch>
350 struct PacketCheck<float, Arch> {
351 static const bool kPass = true;
352 };
353 template<PacketArch Arch>
354 struct PacketCheck<double, Arch> {
355 static const bool kPass = true;
356 };
357 template<typename DType, PacketArch Arch>
358 struct PacketCheck<ScalarExp<DType>, Arch> {
359 static const bool kPass = PacketCheck<DType, Arch>::kPass;
360 };
361 template<int dim, typename DType, PacketArch Arch>
362 struct PacketCheck<Tensor<cpu, dim, DType>, Arch> {
363 static const bool kPass = PacketCheck<DType, Arch>::kPass;
364 };
365 template<typename OP, typename TA, typename DType, int etype, PacketArch Arch>
366 struct PacketCheck<UnaryMapExp<OP, TA, DType, etype>, Arch> {
367 static const bool kPass = PacketCheck<TA, Arch>::kPass &&
368 packet::PacketOp<OP, DType, Arch>::kEnabled;
369 };
370 template<typename OP, typename TA, typename TB, typename DType, int etype, PacketArch Arch>
371 struct PacketCheck< BinaryMapExp<OP, TA, TB, DType, etype>, Arch> {
372 static const bool kPass = packet::PacketOp<OP, DType, Arch>::kEnabled &&
373 PacketCheck<TA, Arch>::kPass && PacketCheck<TB, Arch>::kPass;
374 };
375 //----------------------------------------------------
376 // Check if data is aligned and allow packet operation
377 //----------------------------------------------------
378 template<int dim, typename E, PacketArch Arch>
379 struct PacketAlignCheck {
380 inline static bool Check(const E &exp) {
381 return false;
382 }
383 };
384 template<int dim, typename DType, PacketArch Arch>
385 struct PacketAlignCheck<dim, ScalarExp<DType>, Arch> {
386 inline static bool Check(const ScalarExp<DType> &exp) {
387 return true;
388 }
389 };
390 template<int dim, typename DType, PacketArch Arch>
391 struct PacketAlignCheck<dim, Tensor<cpu, dim, DType>, Arch> {
392 inline static bool Check(const Tensor<cpu, dim, DType> &t) {
393 return packet::CheckAlign<Arch>(t.dptr_) &&
394 packet::CheckAlign<Arch>(t.stride_ * sizeof(DType));
395 }
396 };
397 template<int dim, typename OP, typename TA, typename DType, int etype, PacketArch Arch>
398 struct PacketAlignCheck<dim, UnaryMapExp<OP, TA, DType, etype>, Arch> {
399 inline static bool Check(const UnaryMapExp<OP, TA, DType, etype> &t) {
400 return PacketAlignCheck<dim, TA, Arch>::Check(t.src_);
401 }
402 };
403 template<int dim, typename OP, typename TA, typename TB,
404 typename DType, int etype, PacketArch Arch>
405 struct PacketAlignCheck<dim, BinaryMapExp<OP, TA, TB, DType, etype>, Arch> {
406 inline static bool Check(const BinaryMapExp<OP, TA, TB, DType, etype> &t) {
407 return PacketAlignCheck<dim, TA, Arch>::Check(t.lhs_) &&
408 PacketAlignCheck<dim, TB, Arch>::Check(t.rhs_);
409 }
410 };
411
412 /*!
413 * \brief use PacketPlan to compute result
414 */
415 template<typename SV, typename E, int dim, typename DType, PacketArch Arch>
416 inline void MapPacketPlan(Tensor<cpu, dim, DType> _dst,
417 const expr::PacketPlan<E, DType, Arch>& plan) {
418 Tensor<cpu, 2, DType> dst = _dst.FlatTo2D();
419 const index_t xlen = packet::LowerAlign<DType, Arch>(dst.size(1));
420 const size_t packetSize = packet::Packet<DType, Arch>::size;
421 #ifndef __CUDACC__
422 #pragma omp parallel for
423 #endif
424 for (openmp_index_t y = 0; y < dst.size(0); ++y) {
425 for (index_t x = 0; x < xlen; x += packetSize) {
426 packet::Saver<SV, DType, Arch>::Save(&dst[y][x], plan.EvalPacket(y, x));
427 }
428 for (index_t x = xlen; x < dst.size(1); ++x) {
429 SV::Save(dst[y][x], plan.Eval(y, x));
430 }
431 }
432 }
433 } // namespace expr
434 } // namespace mshadow
435 #endif // MSHADOW_PACKET_INL_H_
436