1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements.  See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership.  The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License.  You may obtain a copy of the License at
9  *
10  *   http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied.  See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
20 /*!
21  * \file packet-inl.h
22  * \brief Generic packet vectorization code
23  */
24 #ifndef MSHADOW_PACKET_INL_H_
25 #define MSHADOW_PACKET_INL_H_
26 
27 #if defined(__APPLE__) || defined(__FreeBSD__) || defined(__DragonFly__)
28 #include <stdlib.h>
29 #else
30 #include <malloc.h>
31 #endif
32 #include "./base.h"
33 #include "./tensor.h"
34 #include "./expression.h"
35 
36 
37 namespace mshadow {
38 /*! \brief namespace of packet math*/
39 namespace packet {
40 
/*!
 * \brief identifies which packet implementation a Packet / PacketOp uses
 *  kPlain is selected by MSHADOW_DEFAULT_PACKET when MSHADOW_USE_SSE is off,
 *  kSSE2 when it is on.
 */
enum PacketArch {
  kPlain = 0,
  kSSE2 = 1
};
45 
// Default packet architecture: kSSE2 when MSHADOW_USE_SSE is non-zero,
// kPlain otherwise (including when the macro is undefined).
#if !MSHADOW_USE_SSE
#define MSHADOW_DEFAULT_PACKET  ::mshadow::packet::kPlain
#else
#define MSHADOW_DEFAULT_PACKET  ::mshadow::packet::kSSE2
#endif
51 
52 // whether packet operator is enabled.
53 /*!
54  * \brief Generic packet type
55  * \tparam DType The data type of the packet.
56  * \tparam Arch the Arch of the packet.
57  */
58 template<typename DType, PacketArch Arch = MSHADOW_DEFAULT_PACKET>
59 struct Packet;
60 
61 template<PacketArch Arch>
62 struct AlignBytes {
63   static const index_t value = 4;
64 };
65 
66 }  // namespace packet
67 }  // namespace mshadow
68 
69 namespace mshadow {
70 namespace packet {
71 /*!
72  * \brief analog to cudaMallocPitch, allocate a aligned space with num_line * lspace cells
73  * \param out_pitch output parameter, the actuall space allocated for each line
74  * \param lspace number of cells required for each line
75  * \param num_line number of lines to be allocated
76  */
AlignedMallocPitch(size_t * out_pitch,size_t lspace,size_t num_line)77 inline void* AlignedMallocPitch(size_t *out_pitch,
78                                 size_t lspace,
79                                 size_t num_line) {
80   const index_t bits = AlignBytes<MSHADOW_DEFAULT_PACKET>::value;
81   const index_t mask = (1 << bits) - 1;
82 
83   size_t pitch = ((lspace + mask) >> bits) << bits;
84   *out_pitch = pitch;
85 #ifdef _MSC_VER
86   void *res = _aligned_malloc(pitch * num_line, 1 << bits);
87 #else
88   void *res;
89   int ret = posix_memalign(&res, 1 << bits, pitch * num_line);
90   CHECK_EQ(ret, 0) << "AlignedMallocPitch failed";
91 #endif
92   if (res == NULL) {
93     LOG(FATAL) << "AlignedMallocPitch failed";
94   }
95 #if __GNUC__ >= 6
96 #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
97 #endif
98   return res;
99 #pragma GCC diagnostic pop
100 }
101 
/*!
 * \brief free aligned space, counterpart of AlignedMallocPitch
 *  Uses _aligned_free under MSVC and free everywhere else.
 * \param ptr pointer to space to be freed
 */
inline void AlignedFree(void *ptr) {
#ifndef _MSC_VER
  free(ptr);
#else
  _aligned_free(ptr);
#endif
}
113 
114 /*! \brief check if a pointer is aligned */
115 template<PacketArch Arch>
CheckAlign(size_t pitch)116 inline bool CheckAlign(size_t pitch) {
117   const index_t bits = AlignBytes<Arch>::value;
118   return !(pitch & ((1 << bits) - 1));
119 }
120 
121 /*! \brief check if a pointer is aligned */
122 template<PacketArch Arch>
CheckAlign(void * ptr)123 inline bool CheckAlign(void *ptr) {
124   return CheckAlign<Arch>(reinterpret_cast<size_t>(ptr));
125 }
126 
127 /*!
128  * \brief get upper bound of aligned index of size
129  * \param size size of the array
130  * \param fsize size of float
131  */
132 template<typename DType, PacketArch Arch>
UpperAlign(index_t size)133 inline index_t UpperAlign(index_t size) {
134   const index_t bits = AlignBytes<MSHADOW_DEFAULT_PACKET>::value;
135   const index_t mask = (1 << bits) - 1;
136   const index_t fsize = sizeof(DType);
137   return (((size * fsize + mask) >> bits) << bits) / fsize;
138 }
139 
140 /*!
141  * \brief get lower bound of aligned index of size
142  * \param size size of the array
143  * \param fsize size of float
144  */
145 template<typename DType, PacketArch Arch>
LowerAlign(index_t size)146 inline index_t LowerAlign(index_t size) {
147   const index_t bits = AlignBytes<MSHADOW_DEFAULT_PACKET>::value;
148   const index_t fsize = sizeof(DType);
149   return (((size * fsize) >> bits) << bits) / fsize;
150 }
151 
152 /*!
153  * \brief generic Packet operator
154  * \tparam OP The operator
155  * \tparam DType The data type
156  * \tparam Arch The architecture.
157  */
158 template<typename OP, typename DType, PacketArch Arch>
159 struct PacketOp {
160   static const bool kEnabled = false;
161 };
162 // specialization of operators
163 template<typename DType, PacketArch Arch>
164 struct PacketOp<op::plus, DType, Arch> {
165   static const bool kEnabled = true;
166   MSHADOW_CINLINE static Packet<DType, Arch> Map(const Packet<DType, Arch>& lhs,
167                                                    const Packet<DType, Arch>& rhs) {
168     return lhs + rhs;
169   }
170 };
171 template<typename DType, PacketArch Arch>
172 struct PacketOp<op::minus, DType, Arch> {
173   static const bool kEnabled = true;
174   MSHADOW_CINLINE static Packet<DType, Arch> Map(const Packet<DType, Arch>& lhs,
175                                                   const Packet<DType, Arch>& rhs) {
176     return lhs - rhs;
177   }
178 };
179 template<typename DType, PacketArch Arch>
180 struct PacketOp<op::mul, DType, Arch> {
181   static const bool kEnabled = true;
182   MSHADOW_CINLINE static Packet<DType, Arch> Map(const Packet<DType, Arch>& lhs,
183                                                   const Packet<DType, Arch>& rhs) {
184     return lhs * rhs;
185   }
186 };
187 template<typename DType, PacketArch Arch>
188 struct PacketOp<op::div, DType, Arch> {
189   static const bool kEnabled = true;
190   MSHADOW_CINLINE static Packet<DType, Arch> Map(const Packet<DType, Arch>& lhs,
191                                                   const Packet<DType, Arch>& rhs) {
192     return lhs / rhs;
193   }
194 };
195 
196 template<typename DType, PacketArch Arch>
197 struct PacketOp<op::identity, DType, Arch> {
198   static const bool kEnabled = true;
199   MSHADOW_CINLINE static Packet<DType, Arch> Map(const Packet<DType, Arch>& src) {
200     return src;
201   }
202 };
203 
204 
205 // savers to do storage
206 template<typename SV, typename TFloat, PacketArch Arch>
207 struct Saver{
208   MSHADOW_CINLINE static void Save(TFloat *dst, const Packet<TFloat, Arch>& src) {
209     Packet<TFloat, Arch> lhs = Packet<TFloat, Arch>::Load(dst);
210     Packet<TFloat, Arch> ans = PacketOp<typename SV::OPType, TFloat, Arch>::Map(lhs, src);
211     ans.Store(dst);
212   }
213 };
214 template<typename TFloat, PacketArch Arch>
215 struct Saver<sv::saveto, TFloat, Arch> {
216   MSHADOW_CINLINE static void Save(TFloat *dst, const Packet<TFloat, Arch>& src) {
217     src.Store(dst);
218   }
219 };
220 }  // namespace packet
221 }  // namespace mshadow
222 
223 #include "packet/plain-inl.h"
224 #if MSHADOW_USE_SSE && !defined(__CUDACC__)
225 #include "packet/sse-inl.h"
226 #endif
227 
228 namespace mshadow {
229 namespace expr {
230 
231 typedef packet::PacketArch PacketArch;
232 
233 // same as plan, but use packet
234 template<typename ExpType, typename DType, PacketArch Arch>
235 class PacketPlan {
236  public:
237   /*!
238    * \brief evaluate the expression at index [y][x],
239    * x will be aligned to Packet<DType, Arch>::Size()
240    */
241   MSHADOW_CINLINE packet::Packet<DType, Arch> EvalPacket(index_t y, index_t x) const;
242   MSHADOW_CINLINE DType Eval(index_t y, index_t x) const;
243 };
244 
245 template <typename Device, int dim, typename DType, PacketArch Arch>
246 class PacketPlan<Tensor<Device, dim, DType>, DType, Arch> {
247  public:
248   explicit PacketPlan(const Tensor<Device, dim, DType> &t)
249       :dptr_(t.dptr_), stride_(t.stride_) {}
250   MSHADOW_CINLINE packet::Packet<DType, Arch> EvalPacket(index_t y, index_t x) const {
251     return packet::Packet<DType, Arch>::Load(&dptr_[y * stride_ + x]);
252   }
253   MSHADOW_CINLINE DType Eval(index_t y, index_t x) const {
254     return dptr_[y * stride_ + x];
255   }
256 
257  private:
258   const DType  *dptr_;
259   index_t stride_;
260 };
261 
262 template<typename DType, PacketArch Arch>
263 class PacketPlan<ScalarExp<DType>, DType, Arch> {
264  public:
265   explicit PacketPlan(DType scalar) : scalar_(scalar) {}
266   MSHADOW_CINLINE packet::Packet<DType, Arch> EvalPacket(index_t y, index_t x) const {
267     return packet::Packet<DType, Arch>::Fill(scalar_);
268   }
269   MSHADOW_CINLINE DType Eval(index_t y, index_t x) const {
270     return scalar_;
271   }
272 
273  private:
274   DType scalar_;
275 };
276 
277 template<typename OP, typename TA, typename TB, int etype, typename DType, PacketArch Arch>
278 class PacketPlan<BinaryMapExp<OP, TA, TB, DType, etype>, DType, Arch> {
279  public:
280   PacketPlan(const PacketPlan<TA, DType, Arch> &lhs, const PacketPlan<TB, DType, Arch> &rhs)
281       : lhs_(lhs), rhs_(rhs) {}
282   MSHADOW_CINLINE packet::Packet<DType, Arch> EvalPacket(index_t y, index_t x) const {
283     return packet::PacketOp<OP, DType, Arch>::Map(lhs_.EvalPacket(y, x), rhs_.EvalPacket(y, x));
284   }
285   MSHADOW_CINLINE DType Eval(index_t y, index_t x) const {
286     return OP::Map(lhs_.Eval(y, x), rhs_.Eval(y, x));
287   }
288 
289  private:
290   PacketPlan<TA, DType, Arch> lhs_;
291   PacketPlan<TB, DType, Arch> rhs_;
292 };
293 
294 template<typename OP, typename TA, int etype, typename DType, PacketArch Arch>
295 class PacketPlan<UnaryMapExp<OP, TA, DType, etype>, DType, Arch> {
296  public:
297   PacketPlan(const PacketPlan<TA, DType, Arch> &src) : src_(src) {}
298   MSHADOW_CINLINE packet::Packet<DType> EvalPacket(index_t y, index_t x) const {
299     return packet::PacketOp<OP, DType, Arch>::Map(src_.EvalPacket(y, x));
300   }
301   MSHADOW_CINLINE DType Eval(index_t y, index_t x) const {
302     return OP::Map(src_.Eval(y, x));
303   }
304 
305  private:
306   PacketPlan<TA, DType, Arch> src_;
307 };
308 
309 template<PacketArch Arch, typename OP, typename TA, typename TB, typename DType, int etype>
310 inline PacketPlan<BinaryMapExp<OP, TA, TB, DType, etype>, DType, Arch>
311 MakePacketPlan(const BinaryMapExp<OP, TA, TB, DType, etype> &e);
312 
313 template<PacketArch Arch, typename DType>
314 inline PacketPlan<ScalarExp<DType>, DType, Arch> MakePacketPlan(const ScalarExp<DType> &e) {
315   return PacketPlan<ScalarExp<DType>, DType, Arch>(e.scalar_);
316 }
317 template<PacketArch Arch, typename T, typename DType>
318 inline PacketPlan<T, DType, Arch> MakePacketPlan(const RValueExp<T, DType> &e) {
319   return PacketPlan<T, DType, Arch>(e.self());
320 }
321 template<PacketArch Arch, typename T, int dim, typename DType>
322 inline PacketPlan<T, DType, Arch>
323 MakePacketPlan(const MakeTensorExp<T, cpu, dim, DType> &e) {
324   return PacketPlan<T, DType, Arch>(e.real_self());
325 }
326 template<PacketArch Arch, typename OP, typename TA, typename DType, int etype>
327 inline PacketPlan<UnaryMapExp<OP, TA, DType, etype>, DType, Arch>
328 MakePacketPlan(const UnaryMapExp<OP, TA, DType, etype> &e) {
329   return PacketPlan<UnaryMapExp<OP, TA, DType, etype>, DType, Arch>(MakePacketPlan<Arch>(e.src_));
330 }
331 template<PacketArch Arch, typename OP, typename TA, typename TB, typename DType, int etype>
332 inline PacketPlan<BinaryMapExp<OP, TA, TB, DType, etype>, DType, Arch>
333 MakePacketPlan(const BinaryMapExp<OP, TA, TB, DType, etype> &e) {
334   return PacketPlan<BinaryMapExp<OP, TA, TB, DType, etype>,
335                     DType, Arch>(MakePacketPlan<Arch>(e.lhs_), MakePacketPlan<Arch>(e.rhs_));
336 }
337 
338 /*!
339  * \brief static check packet enable
340  *
341  * \tparam Device the type of Device
342  * \tparam dim dimension of the tensor
343  * \tparam E expression
344  */
345 template<typename E, PacketArch Arch>
346 struct PacketCheck{
347   static const bool kPass = false;
348 };
349 template<PacketArch Arch>
350 struct PacketCheck<float, Arch> {
351   static const bool kPass = true;
352 };
353 template<PacketArch Arch>
354 struct PacketCheck<double, Arch> {
355   static const bool kPass = true;
356 };
357 template<typename DType, PacketArch Arch>
358 struct PacketCheck<ScalarExp<DType>, Arch> {
359   static const bool kPass = PacketCheck<DType, Arch>::kPass;
360 };
361 template<int dim, typename DType, PacketArch Arch>
362 struct PacketCheck<Tensor<cpu, dim, DType>, Arch> {
363   static const bool kPass = PacketCheck<DType, Arch>::kPass;
364 };
365 template<typename OP, typename TA, typename DType, int etype, PacketArch Arch>
366 struct PacketCheck<UnaryMapExp<OP, TA, DType, etype>, Arch> {
367   static const bool kPass = PacketCheck<TA, Arch>::kPass &&
368       packet::PacketOp<OP, DType, Arch>::kEnabled;
369 };
370 template<typename OP, typename TA, typename TB, typename DType, int etype, PacketArch Arch>
371 struct PacketCheck< BinaryMapExp<OP, TA, TB, DType, etype>, Arch> {
372   static const bool kPass = packet::PacketOp<OP, DType, Arch>::kEnabled &&
373       PacketCheck<TA, Arch>::kPass && PacketCheck<TB, Arch>::kPass;
374 };
375 //----------------------------------------------------
376 // Check if data is aligned and allow packet operation
377 //----------------------------------------------------
378 template<int dim, typename E, PacketArch Arch>
379 struct PacketAlignCheck {
380   inline static bool Check(const E &exp) {
381     return false;
382   }
383 };
384 template<int dim, typename DType, PacketArch Arch>
385 struct PacketAlignCheck<dim, ScalarExp<DType>, Arch> {
386   inline static bool Check(const ScalarExp<DType> &exp) {
387     return true;
388   }
389 };
390 template<int dim, typename DType, PacketArch Arch>
391 struct PacketAlignCheck<dim, Tensor<cpu, dim, DType>, Arch> {
392   inline static bool Check(const Tensor<cpu, dim, DType> &t) {
393     return packet::CheckAlign<Arch>(t.dptr_) &&
394         packet::CheckAlign<Arch>(t.stride_ * sizeof(DType));
395   }
396 };
397 template<int dim, typename OP, typename TA, typename DType, int etype, PacketArch Arch>
398 struct PacketAlignCheck<dim, UnaryMapExp<OP, TA, DType, etype>, Arch> {
399   inline static bool Check(const UnaryMapExp<OP, TA, DType, etype> &t) {
400     return PacketAlignCheck<dim, TA, Arch>::Check(t.src_);
401   }
402 };
403 template<int dim, typename OP, typename TA, typename TB,
404          typename DType, int etype, PacketArch Arch>
405 struct PacketAlignCheck<dim, BinaryMapExp<OP, TA, TB, DType, etype>, Arch> {
406   inline static bool Check(const BinaryMapExp<OP, TA, TB, DType, etype> &t) {
407     return PacketAlignCheck<dim, TA, Arch>::Check(t.lhs_) &&
408         PacketAlignCheck<dim, TB, Arch>::Check(t.rhs_);
409   }
410 };
411 
412 /*!
413  * \brief use PacketPlan to compute result
414  */
415 template<typename SV, typename E, int dim, typename DType, PacketArch Arch>
416 inline void MapPacketPlan(Tensor<cpu, dim, DType> _dst,
417                           const expr::PacketPlan<E, DType, Arch>& plan) {
418   Tensor<cpu, 2, DType> dst = _dst.FlatTo2D();
419   const index_t xlen = packet::LowerAlign<DType, Arch>(dst.size(1));
420   const size_t packetSize = packet::Packet<DType, Arch>::size;
421 #ifndef __CUDACC__
422   #pragma omp parallel for
423 #endif
424   for (openmp_index_t y = 0; y < dst.size(0); ++y) {
425     for (index_t x = 0; x < xlen; x += packetSize) {
426       packet::Saver<SV, DType, Arch>::Save(&dst[y][x], plan.EvalPacket(y, x));
427     }
428     for (index_t x = xlen; x < dst.size(1); ++x) {
429       SV::Save(dst[y][x], plan.Eval(y, x));
430     }
431   }
432 }
433 }  // namespace expr
434 }  // namespace mshadow
435 #endif  // MSHADOW_PACKET_INL_H_
436