1 /*  Copyright (C) 2013-2014  Povilas Kanapickas <povilas@radix.lt>
2 
3     Distributed under the Boost Software License, Version 1.0.
4         (See accompanying file LICENSE_1_0.txt or copy at
5             http://www.boost.org/LICENSE_1_0.txt)
6 */
7 
8 #ifndef LIBSIMDPP_SIMDPP_CORE_I_DIV_P_H
9 #define LIBSIMDPP_SIMDPP_CORE_I_DIV_P_H
10 
11 #ifndef LIBSIMDPP_SIMD_H
12     #error "This file must be included through simd.h"
13 #endif
14 
15 #include <simdpp/types.h>
16 #include <simdpp/core/bit_and.h>
17 #include <simdpp/core/bit_andnot.h>
18 #include <simdpp/core/bit_or.h>
19 #include <simdpp/core/cmp_lt.h>
20 #include <simdpp/core/i_sub.h>
21 #include <simdpp/detail/null/math.h>
22 
23 namespace simdpp {
24 namespace SIMDPP_ARCH_NAMESPACE {
25 
26 // FIXME: move to adv
27 /** Divides one 8-bit unsigned number by another. The precision of the operation
28     is configurable: only P least significant bits of both numerator and
29     denumerator are considered.
30 
31     @code
32     r0 = num0 / den0
33     ...
34     rN = numN / denN
35     @endcode
36     @par 128-bit version:
37     The operations costs at least 9 instructions per bit of precision.
38 
39     @par 256-bit version:
40     @icost{SSE2-AVX, NEON, 10}
41     @icost{AVX2, 4}
42 */
43 template<unsigned P> SIMDPP_INL
div_p(const uint8x16 & num,const uint8x16 & den)44 uint8x16 div_p(const uint8x16& num, const uint8x16& den)
45 {
46 #if SIMDPP_USE_NULL
47     return detail::null::div_p<P>(num, den);
48 #else
49     static_assert(P <= 8, "Precision too large");
50     uint8x16 r, q, bit_mask;
51     r = q = make_zero();
52     bit_mask = make_uint(1 << (P-1));
53 
54     for (unsigned i = P; i > 0; i--) {
55         unsigned bit = i-1;
56         uint8x16 n_bit;
57         // we'll never shift out any bits, so larger shift doesn't matter
58         r = shift_l<1>((uint16x8)r);
59 
60         n_bit = bit_and(num, bit_mask);
61         n_bit = shift_r((uint16x8)n_bit, bit);
62         r = bit_or(r, n_bit);
63 
64         uint8x16 cmp, csub, cbit;
65         cmp = cmp_lt(r, den);
66 
67         csub = bit_andnot(den, cmp);
68         cbit = bit_andnot(bit_mask, cmp);
69         r = sub(r, csub);
70         q = bit_or(q, cbit);
71 
72         bit_mask = shift_r<1>((uint16x8)bit_mask);
73     }
74     return q;
75 
76     /*
77     The actual algorithm is as follows:
78     N - numerator, D - denominator, R - remainder, Q - quetient
79     R = 0; Q = 0;
80     for (unsigned i = P; i > 0; i--) {
81         unsigned bit = i-1;
82         R <<= 1;
83         R |= (N >> bit) & 1;
84         if (R >= D) {
85             R = R - D;
86             Q |= 1 << bit;
87         }
88     }*/
89 #endif
90 }
91 
92 template<unsigned P> SIMDPP_INL
div_p(const uint16x8 & num,const uint16x8 & den)93 uint16x8 div_p(const uint16x8& num, const uint16x8& den)
94 {
95 #if SIMDPP_USE_NULL
96     return detail::null::div_p<P>(num, den);
97 #else
98     static_assert(P <= 16, "Precision too large");
99     uint16x8 r, q, bit_mask;
100 
101     r = q = make_zero();
102     bit_mask = make_uint(1 << (P-1));
103 
104     for (unsigned i = P; i > 0; i--) {
105         unsigned bit = i-1; // TODO precision
106         uint16x8 n_bit;
107         r = shift_l<1>(r);
108 
109         n_bit = bit_and(num, bit_mask);
110         n_bit = shift_r(n_bit, bit);
111         r = bit_or(r, n_bit);
112 
113         uint16x8 cmp, csub, cbit;
114         cmp = cmp_lt(r, den);
115 
116         csub = bit_andnot(den, cmp);
117         cbit = bit_andnot(bit_mask, cmp);
118         r = sub(r, csub);
119         q = bit_or(q, cbit);
120 
121         bit_mask = shift_r<1>(bit_mask);
122     }
123     return q;
124 #endif
125 }
126 
127 } // namespace SIMDPP_ARCH_NAMESPACE
128 } // namespace simdpp
129 
130 #endif
131 
132