/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ /*! * \file packet-inl.h * \brief Generic packet vectorization code */ #ifndef MSHADOW_PACKET_INL_H_ #define MSHADOW_PACKET_INL_H_ #if defined(__APPLE__) || defined(__FreeBSD__) || defined(__DragonFly__) #include #else #include #endif #include "./base.h" #include "./tensor.h" #include "./expression.h" namespace mshadow { /*! \brief namespace of packet math*/ namespace packet { enum PacketArch { kPlain, kSSE2, }; #if MSHADOW_USE_SSE #define MSHADOW_DEFAULT_PACKET ::mshadow::packet::kSSE2 #else #define MSHADOW_DEFAULT_PACKET ::mshadow::packet::kPlain #endif // whether packet operator is enabled. /*! * \brief Generic packet type * \tparam DType The data type of the packet. * \tparam Arch the Arch of the packet. */ template struct Packet; template struct AlignBytes { static const index_t value = 4; }; } // namespace packet } // namespace mshadow namespace mshadow { namespace packet { /*! * \brief analog to cudaMallocPitch, allocate a aligned space with num_line * lspace cells * \param out_pitch output parameter, the actuall space allocated for each line * \param lspace number of cells required for each line * \param num_line number of lines to be allocated */ inline void* AlignedMallocPitch(size_t *out_pitch, size_t lspace, size_t num_line) { const index_t bits = AlignBytes::value; const index_t mask = (1 << bits) - 1; size_t pitch = ((lspace + mask) >> bits) << bits; *out_pitch = pitch; #ifdef _MSC_VER void *res = _aligned_malloc(pitch * num_line, 1 << bits); #else void *res; int ret = posix_memalign(&res, 1 << bits, pitch * num_line); CHECK_EQ(ret, 0) << "AlignedMallocPitch failed"; #endif if (res == NULL) { LOG(FATAL) << "AlignedMallocPitch failed"; } #if __GNUC__ >= 6 #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif return res; #pragma GCC diagnostic pop } /*! * \brief free aligned space * \param ptr pointer to space to be freed */ inline void AlignedFree(void *ptr) { #ifdef _MSC_VER _aligned_free(ptr); #else free(ptr); #endif } /*! \brief check if a pointer is aligned */ template inline bool CheckAlign(size_t pitch) { const index_t bits = AlignBytes::value; return !(pitch & ((1 << bits) - 1)); } /*! \brief check if a pointer is aligned */ template inline bool CheckAlign(void *ptr) { return CheckAlign(reinterpret_cast(ptr)); } /*! * \brief get upper bound of aligned index of size * \param size size of the array * \param fsize size of float */ template inline index_t UpperAlign(index_t size) { const index_t bits = AlignBytes::value; const index_t mask = (1 << bits) - 1; const index_t fsize = sizeof(DType); return (((size * fsize + mask) >> bits) << bits) / fsize; } /*! * \brief get lower bound of aligned index of size * \param size size of the array * \param fsize size of float */ template inline index_t LowerAlign(index_t size) { const index_t bits = AlignBytes::value; const index_t fsize = sizeof(DType); return (((size * fsize) >> bits) << bits) / fsize; } /*! * \brief generic Packet operator * \tparam OP The operator * \tparam DType The data type * \tparam Arch The architecture. */ template struct PacketOp { static const bool kEnabled = false; }; // specialization of operators template struct PacketOp { static const bool kEnabled = true; MSHADOW_CINLINE static Packet Map(const Packet& lhs, const Packet& rhs) { return lhs + rhs; } }; template struct PacketOp { static const bool kEnabled = true; MSHADOW_CINLINE static Packet Map(const Packet& lhs, const Packet& rhs) { return lhs - rhs; } }; template struct PacketOp { static const bool kEnabled = true; MSHADOW_CINLINE static Packet Map(const Packet& lhs, const Packet& rhs) { return lhs * rhs; } }; template struct PacketOp { static const bool kEnabled = true; MSHADOW_CINLINE static Packet Map(const Packet& lhs, const Packet& rhs) { return lhs / rhs; } }; template struct PacketOp { static const bool kEnabled = true; MSHADOW_CINLINE static Packet Map(const Packet& src) { return src; } }; // savers to do storage template struct Saver{ MSHADOW_CINLINE static void Save(TFloat *dst, const Packet& src) { Packet lhs = Packet::Load(dst); Packet ans = PacketOp::Map(lhs, src); ans.Store(dst); } }; template struct Saver { MSHADOW_CINLINE static void Save(TFloat *dst, const Packet& src) { src.Store(dst); } }; } // namespace packet } // namespace mshadow #include "packet/plain-inl.h" #if MSHADOW_USE_SSE && !defined(__CUDACC__) #include "packet/sse-inl.h" #endif namespace mshadow { namespace expr { typedef packet::PacketArch PacketArch; // same as plan, but use packet template class PacketPlan { public: /*! * \brief evaluate the expression at index [y][x], * x will be aligned to Packet::Size() */ MSHADOW_CINLINE packet::Packet EvalPacket(index_t y, index_t x) const; MSHADOW_CINLINE DType Eval(index_t y, index_t x) const; }; template class PacketPlan, DType, Arch> { public: explicit PacketPlan(const Tensor &t) :dptr_(t.dptr_), stride_(t.stride_) {} MSHADOW_CINLINE packet::Packet EvalPacket(index_t y, index_t x) const { return packet::Packet::Load(&dptr_[y * stride_ + x]); } MSHADOW_CINLINE DType Eval(index_t y, index_t x) const { return dptr_[y * stride_ + x]; } private: const DType *dptr_; index_t stride_; }; template class PacketPlan, DType, Arch> { public: explicit PacketPlan(DType scalar) : scalar_(scalar) {} MSHADOW_CINLINE packet::Packet EvalPacket(index_t y, index_t x) const { return packet::Packet::Fill(scalar_); } MSHADOW_CINLINE DType Eval(index_t y, index_t x) const { return scalar_; } private: DType scalar_; }; template class PacketPlan, DType, Arch> { public: PacketPlan(const PacketPlan &lhs, const PacketPlan &rhs) : lhs_(lhs), rhs_(rhs) {} MSHADOW_CINLINE packet::Packet EvalPacket(index_t y, index_t x) const { return packet::PacketOp::Map(lhs_.EvalPacket(y, x), rhs_.EvalPacket(y, x)); } MSHADOW_CINLINE DType Eval(index_t y, index_t x) const { return OP::Map(lhs_.Eval(y, x), rhs_.Eval(y, x)); } private: PacketPlan lhs_; PacketPlan rhs_; }; template class PacketPlan, DType, Arch> { public: PacketPlan(const PacketPlan &src) : src_(src) {} MSHADOW_CINLINE packet::Packet EvalPacket(index_t y, index_t x) const { return packet::PacketOp::Map(src_.EvalPacket(y, x)); } MSHADOW_CINLINE DType Eval(index_t y, index_t x) const { return OP::Map(src_.Eval(y, x)); } private: PacketPlan src_; }; template inline PacketPlan, DType, Arch> MakePacketPlan(const BinaryMapExp &e); template inline PacketPlan, DType, Arch> MakePacketPlan(const ScalarExp &e) { return PacketPlan, DType, Arch>(e.scalar_); } template inline PacketPlan MakePacketPlan(const RValueExp &e) { return PacketPlan(e.self()); } template inline PacketPlan MakePacketPlan(const MakeTensorExp &e) { return PacketPlan(e.real_self()); } template inline PacketPlan, DType, Arch> MakePacketPlan(const UnaryMapExp &e) { return PacketPlan, DType, Arch>(MakePacketPlan(e.src_)); } template inline PacketPlan, DType, Arch> MakePacketPlan(const BinaryMapExp &e) { return PacketPlan, DType, Arch>(MakePacketPlan(e.lhs_), MakePacketPlan(e.rhs_)); } /*! * \brief static check packet enable * * \tparam Device the type of Device * \tparam dim dimension of the tensor * \tparam E expression */ template struct PacketCheck{ static const bool kPass = false; }; template struct PacketCheck { static const bool kPass = true; }; template struct PacketCheck { static const bool kPass = true; }; template struct PacketCheck, Arch> { static const bool kPass = PacketCheck::kPass; }; template struct PacketCheck, Arch> { static const bool kPass = PacketCheck::kPass; }; template struct PacketCheck, Arch> { static const bool kPass = PacketCheck::kPass && packet::PacketOp::kEnabled; }; template struct PacketCheck< BinaryMapExp, Arch> { static const bool kPass = packet::PacketOp::kEnabled && PacketCheck::kPass && PacketCheck::kPass; }; //---------------------------------------------------- // Check if data is aligned and allow packet operation //---------------------------------------------------- template struct PacketAlignCheck { inline static bool Check(const E &exp) { return false; } }; template struct PacketAlignCheck, Arch> { inline static bool Check(const ScalarExp &exp) { return true; } }; template struct PacketAlignCheck, Arch> { inline static bool Check(const Tensor &t) { return packet::CheckAlign(t.dptr_) && packet::CheckAlign(t.stride_ * sizeof(DType)); } }; template struct PacketAlignCheck, Arch> { inline static bool Check(const UnaryMapExp &t) { return PacketAlignCheck::Check(t.src_); } }; template struct PacketAlignCheck, Arch> { inline static bool Check(const BinaryMapExp &t) { return PacketAlignCheck::Check(t.lhs_) && PacketAlignCheck::Check(t.rhs_); } }; /*! * \brief use PacketPlan to compute result */ template inline void MapPacketPlan(Tensor _dst, const expr::PacketPlan& plan) { Tensor dst = _dst.FlatTo2D(); const index_t xlen = packet::LowerAlign(dst.size(1)); const size_t packetSize = packet::Packet::size; #ifndef __CUDACC__ #pragma omp parallel for #endif for (openmp_index_t y = 0; y < dst.size(0); ++y) { for (index_t x = 0; x < xlen; x += packetSize) { packet::Saver::Save(&dst[y][x], plan.EvalPacket(y, x)); } for (index_t x = xlen; x < dst.size(1); ++x) { SV::Save(dst[y][x], plan.Eval(y, x)); } } } } // namespace expr } // namespace mshadow #endif // MSHADOW_PACKET_INL_H_