1 /* Copyright (C) 2013-2014 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6 */
7
8 #ifndef LIBSIMDPP_SIMDPP_CORE_I_DIV_P_H
9 #define LIBSIMDPP_SIMDPP_CORE_I_DIV_P_H
10
11 #ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13 #endif
14
15 #include <simdpp/types.h>
16 #include <simdpp/core/bit_and.h>
17 #include <simdpp/core/bit_andnot.h>
18 #include <simdpp/core/bit_or.h>
19 #include <simdpp/core/cmp_lt.h>
20 #include <simdpp/core/i_sub.h>
21 #include <simdpp/detail/null/math.h>
22
23 namespace simdpp {
24 namespace SIMDPP_ARCH_NAMESPACE {
25
26 // FIXME: move to adv
27 /** Divides one 8-bit unsigned number by another. The precision of the operation
28 is configurable: only P least significant bits of both numerator and
29 denumerator are considered.
30
31 @code
32 r0 = num0 / den0
33 ...
34 rN = numN / denN
35 @endcode
36 @par 128-bit version:
37 The operations costs at least 9 instructions per bit of precision.
38
39 @par 256-bit version:
40 @icost{SSE2-AVX, NEON, 10}
41 @icost{AVX2, 4}
42 */
43 template<unsigned P> SIMDPP_INL
div_p(const uint8x16 & num,const uint8x16 & den)44 uint8x16 div_p(const uint8x16& num, const uint8x16& den)
45 {
46 #if SIMDPP_USE_NULL
47 return detail::null::div_p<P>(num, den);
48 #else
49 static_assert(P <= 8, "Precision too large");
50 uint8x16 r, q, bit_mask;
51 r = q = make_zero();
52 bit_mask = make_uint(1 << (P-1));
53
54 for (unsigned i = P; i > 0; i--) {
55 unsigned bit = i-1;
56 uint8x16 n_bit;
57 // we'll never shift out any bits, so larger shift doesn't matter
58 r = shift_l<1>((uint16x8)r);
59
60 n_bit = bit_and(num, bit_mask);
61 n_bit = shift_r((uint16x8)n_bit, bit);
62 r = bit_or(r, n_bit);
63
64 uint8x16 cmp, csub, cbit;
65 cmp = cmp_lt(r, den);
66
67 csub = bit_andnot(den, cmp);
68 cbit = bit_andnot(bit_mask, cmp);
69 r = sub(r, csub);
70 q = bit_or(q, cbit);
71
72 bit_mask = shift_r<1>((uint16x8)bit_mask);
73 }
74 return q;
75
76 /*
77 The actual algorithm is as follows:
78 N - numerator, D - denominator, R - remainder, Q - quetient
79 R = 0; Q = 0;
80 for (unsigned i = P; i > 0; i--) {
81 unsigned bit = i-1;
82 R <<= 1;
83 R |= (N >> bit) & 1;
84 if (R >= D) {
85 R = R - D;
86 Q |= 1 << bit;
87 }
88 }*/
89 #endif
90 }
91
92 template<unsigned P> SIMDPP_INL
div_p(const uint16x8 & num,const uint16x8 & den)93 uint16x8 div_p(const uint16x8& num, const uint16x8& den)
94 {
95 #if SIMDPP_USE_NULL
96 return detail::null::div_p<P>(num, den);
97 #else
98 static_assert(P <= 16, "Precision too large");
99 uint16x8 r, q, bit_mask;
100
101 r = q = make_zero();
102 bit_mask = make_uint(1 << (P-1));
103
104 for (unsigned i = P; i > 0; i--) {
105 unsigned bit = i-1; // TODO precision
106 uint16x8 n_bit;
107 r = shift_l<1>(r);
108
109 n_bit = bit_and(num, bit_mask);
110 n_bit = shift_r(n_bit, bit);
111 r = bit_or(r, n_bit);
112
113 uint16x8 cmp, csub, cbit;
114 cmp = cmp_lt(r, den);
115
116 csub = bit_andnot(den, cmp);
117 cbit = bit_andnot(bit_mask, cmp);
118 r = sub(r, csub);
119 q = bit_or(q, cbit);
120
121 bit_mask = shift_r<1>(bit_mask);
122 }
123 return q;
124 #endif
125 }
126
127 } // namespace SIMDPP_ARCH_NAMESPACE
128 } // namespace simdpp
129
130 #endif
131
132