/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2014, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* FAST INTEGER FORWARD DCT
 *
 * This is similar to the SSE2 implementation, except that we left-shift the
 * constants by 1 less bit (the -1 in CONST_SHIFT.)  This is because
 * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
 *   the elements in arg3 + the most significant 17 bits of
 *     (the elements in arg1 * the elements in arg2).
 */

#include "jsimd_altivec.h"


#define F_0_382  98   /* FIX(0.382683433) */
#define F_0_541  139  /* FIX(0.541196100) */
#define F_0_707  181  /* FIX(0.707106781) */
#define F_1_306  334  /* FIX(1.306562965) */

#define CONST_BITS  8
#define PRE_MULTIPLY_SCALE_BITS  2
#define CONST_SHIFT  (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
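
/* Worked example (for illustration only): with CONST_BITS = 8 and
 * PRE_MULTIPLY_SCALE_BITS = 2, CONST_SHIFT = 16 - 2 - 8 - 1 = 5, so the
 * constant F_0_707 = 181 is stored as 181 << 5 = 5792.  For an input
 * element x, the code below computes (element-wise)
 *   vec_madds(x << 2, 5792, 0) = ((x << 2) * 5792) >> 15
 *                              = (x * 181) >> 8
 *                              ~= x * 0.707106781,
 * i.e. the same descaled fixed-point multiplication as the scalar fast DCT.
 */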


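/* One 8-point 1-D forward DCT pass (same structure as the scalar fast DCT).
 * tmp0..tmp7 must already contain the initial butterfly sums/differences of
 * the eight inputs; each 16-bit lane of a vector carries an independent row
 * (Pass 1) or column (Pass 2), so a single invocation transforms the whole
 * 8x8 block along one dimension.  Results are left in out0..out7.
 */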
#define DO_FDCT() { \
  /* Even part */ \
  \
  tmp10 = vec_add(tmp0, tmp3); \
  tmp13 = vec_sub(tmp0, tmp3); \
  tmp11 = vec_add(tmp1, tmp2); \
  tmp12 = vec_sub(tmp1, tmp2); \
  \
  out0  = vec_add(tmp10, tmp11); \
  out4  = vec_sub(tmp10, tmp11); \
  \
  z1 = vec_add(tmp12, tmp13); \
  z1 = vec_sl(z1, pre_multiply_scale_bits); \
  z1 = vec_madds(z1, pw_0707, pw_zero); \
  \
  out2 = vec_add(tmp13, z1); \
  out6 = vec_sub(tmp13, z1); \
  \
  /* Odd part */ \
  \
  tmp10 = vec_add(tmp4, tmp5); \
  tmp11 = vec_add(tmp5, tmp6); \
  tmp12 = vec_add(tmp6, tmp7); \
  \
  tmp10 = vec_sl(tmp10, pre_multiply_scale_bits); \
  tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \
  z5 = vec_sub(tmp10, tmp12); \
  z5 = vec_madds(z5, pw_0382, pw_zero); \
  \
  z2 = vec_madds(tmp10, pw_0541, z5); \
  z4 = vec_madds(tmp12, pw_1306, z5); \
  \
  tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \
  z3 = vec_madds(tmp11, pw_0707, pw_zero); \
  \
  z11 = vec_add(tmp7, z3); \
  z13 = vec_sub(tmp7, z3); \
  \
  out5 = vec_add(z13, z2); \
  out3 = vec_sub(z13, z2); \
  out1 = vec_add(z11, z4); \
  out7 = vec_sub(z11, z4); \
}


void jsimd_fdct_ifast_altivec(DCTELEM *data)
{
  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
    col0, col1, col2, col3, col4, col5, col6, col7,
    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
    z1, z2, z3, z4, z5, z11, z13,
    out0, out1, out2, out3, out4, out5, out6, out7;

  /* Constants */
  __vector short pw_zero = { __8X(0) },
    pw_0382 = { __8X(F_0_382 << CONST_SHIFT) },
    pw_0541 = { __8X(F_0_541 << CONST_SHIFT) },
    pw_0707 = { __8X(F_0_707 << CONST_SHIFT) },
    pw_1306 = { __8X(F_1_306 << CONST_SHIFT) };
  __vector unsigned short
    pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) };

  /* Pass 1: process rows */

  row0 = vec_ld(0, data);
  row1 = vec_ld(16, data);
  row2 = vec_ld(32, data);
  row3 = vec_ld(48, data);
  row4 = vec_ld(64, data);
  row5 = vec_ld(80, data);
  row6 = vec_ld(96, data);
  row7 = vec_ld(112, data);

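  /* Transpose the block so that lane m of col_k holds element k of row m;
   * the 1-D DCT below then processes all eight rows in parallel, one row per
   * vector lane. */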
  TRANSPOSE(row, col);

  tmp0 = vec_add(col0, col7);
  tmp7 = vec_sub(col0, col7);
  tmp1 = vec_add(col1, col6);
  tmp6 = vec_sub(col1, col6);
  tmp2 = vec_add(col2, col5);
  tmp5 = vec_sub(col2, col5);
  tmp3 = vec_add(col3, col4);
  tmp4 = vec_sub(col3, col4);

  DO_FDCT();

  /* Pass 2: process columns */

  TRANSPOSE(out, row);

  tmp0 = vec_add(row0, row7);
  tmp7 = vec_sub(row0, row7);
  tmp1 = vec_add(row1, row6);
  tmp6 = vec_sub(row1, row6);
  tmp2 = vec_add(row2, row5);
  tmp5 = vec_sub(row2, row5);
  tmp3 = vec_add(row3, row4);
  tmp4 = vec_sub(row3, row4);

  DO_FDCT();

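  /* After the second pass, out0..out7 already hold the coefficient block in
   * natural row order, so it can be stored directly without a further
   * transpose. */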
  vec_st(out0, 0, data);
  vec_st(out1, 16, data);
  vec_st(out2, 32, data);
  vec_st(out3, 48, data);
  vec_st(out4, 64, data);
  vec_st(out5, 80, data);
  vec_st(out6, 96, data);
  vec_st(out7, 112, data);
}