/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2014, D. R. Commander.
 * All rights reserved.
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */
22 
/* SLOW INTEGER FORWARD DCT */
24 
25 #include "jsimd_altivec.h"
26 
27 
/*
 * Fixed-point multipliers.  Each F_x_yyy is FIX(x.yyy...), i.e. the real
 * constant scaled by 2^CONST_BITS (8192) and rounded to the nearest integer.
 * Every pairwise sum/difference of these values that is used to build a
 * 16-bit multiplier vector in this file fits in a signed 16-bit lane.
 */
#define F_0_298 2446   /* FIX(0.298631336) */
#define F_0_390 3196   /* FIX(0.390180644) */
#define F_0_541 4433   /* FIX(0.541196100) */
#define F_0_765 6270   /* FIX(0.765366865) */
#define F_0_899 7373   /* FIX(0.899976223) */
#define F_1_175 9633   /* FIX(1.175875602) */
#define F_1_501 12299  /* FIX(1.501321110) */
#define F_1_847 15137  /* FIX(1.847759065) */
#define F_1_961 16069  /* FIX(1.961570560) */
#define F_2_053 16819  /* FIX(2.053119869) */
#define F_2_562 20995  /* FIX(2.562915447) */
#define F_3_072 25172  /* FIX(3.072711026) */

/*
 * CONST_BITS  - number of fractional bits carried by the F_* multipliers.
 * PASS1_BITS  - extra fractional bits kept in the pass 1 results.
 * DESCALE_P1  - right shift applied to 32-bit products in pass 1 (removes
 *               CONST_BITS but leaves the results scaled by 2^PASS1_BITS).
 * DESCALE_P2  - right shift applied to 32-bit products in pass 2 (removes
 *               CONST_BITS and the PASS1_BITS scaling left by pass 1).
 */
#define CONST_BITS 13
#define PASS1_BITS 2
#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
#define DESCALE_P2 (CONST_BITS + PASS1_BITS)
45 
46 
/*
 * DO_FDCT_COMMON(PASS)
 *
 * Portion of the 1-D forward DCT that is identical in both passes: the
 * rotation that produces outputs 2 and 6, and the whole odd part (outputs
 * 1, 3, 5 and 7).  PASS (1 or 2) is token-pasted to select the rounding
 * vector pd_descale_p<PASS> and the shift-count vector descale_p<PASS>.
 *
 * Reads:   tmp4..tmp7, tmp12, tmp13, the pw_* multiplier vectors,
 *          pd_descale_p<PASS> and descale_p<PASS>.
 * Writes:  out1, out2, out3, out5, out6, out7 (plus the scratch vectors
 *          z3, z4, z3l/h, z4l/h, z34l/h, tmp47l/h, tmp56l/h, tmp1312l/h and
 *          out<N>l/h).
 *
 * Idiom used throughout: vec_mergeh()/vec_mergel() interleave two 16-bit
 * vectors so that each 32-bit lane holds an (a, b) pair; vec_msums() then
 * multiplies that pair by a (c0, c1) constant pair and adds the third
 * operand, yielding a*c0 + b*c1 + acc per 32-bit lane (with saturation).
 * The "l"/"h" suffixes are the results derived from vec_mergeh()/vec_mergel()
 * respectively; vec_pack() recombines them into one 16-bit vector.
 *
 * Wrapped in do { } while (0) so that an invocation followed by ';' is a
 * single statement and is safe inside an unbraced if/else.
 */
#define DO_FDCT_COMMON(PASS)  \
do {  \
  /* (Original)  \
   * z1 = (tmp12 + tmp13) * 0.541196100;  \
   * data2 = z1 + tmp13 * 0.765366865;  \
   * data6 = z1 + tmp12 * -1.847759065;  \
   *  \
   * (This implementation)  \
   * data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;  \
   * data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);  \
   */  \
  \
  tmp1312l = vec_mergeh(tmp13, tmp12);  \
  tmp1312h = vec_mergel(tmp13, tmp12);  \
  \
  /* The rounding term is supplied as the accumulator operand. */  \
  out2l = vec_msums(tmp1312l, pw_f130_f054, pd_descale_p##PASS);  \
  out2h = vec_msums(tmp1312h, pw_f130_f054, pd_descale_p##PASS);  \
  out6l = vec_msums(tmp1312l, pw_f054_mf130, pd_descale_p##PASS);  \
  out6h = vec_msums(tmp1312h, pw_f054_mf130, pd_descale_p##PASS);  \
  \
  out2l = vec_sra(out2l, descale_p##PASS);  \
  out2h = vec_sra(out2h, descale_p##PASS);  \
  out6l = vec_sra(out6l, descale_p##PASS);  \
  out6h = vec_sra(out6h, descale_p##PASS);  \
  \
  out2 = vec_pack(out2l, out2h);  \
  out6 = vec_pack(out6l, out6h);  \
  \
  /* Odd part */  \
  \
  z3 = vec_add(tmp4, tmp6);  \
  z4 = vec_add(tmp5, tmp7);  \
  \
  /* (Original)  \
   * z5 = (z3 + z4) * 1.175875602;  \
   * z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;  \
   * z3 += z5;  z4 += z5;  \
   *  \
   * (This implementation)  \
   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;  \
   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);  \
   */  \
  \
  z34l = vec_mergeh(z3, z4);  \
  z34h = vec_mergel(z3, z4);  \
  \
  /* z3l/h and z4l/h already contain the rounding term, so the sums below  \
   * that use them as the accumulator are rounded exactly once.  \
   */  \
  z3l = vec_msums(z34l, pw_mf078_f117, pd_descale_p##PASS);  \
  z3h = vec_msums(z34h, pw_mf078_f117, pd_descale_p##PASS);  \
  z4l = vec_msums(z34l, pw_f117_f078, pd_descale_p##PASS);  \
  z4h = vec_msums(z34h, pw_f117_f078, pd_descale_p##PASS);  \
  \
  /* (Original)  \
   * z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;  \
   * tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;  \
   * tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;  \
   * z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;  \
   * data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;  \
   * data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;  \
   *  \
   * (This implementation)  \
   * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;  \
   * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;  \
   * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);  \
   * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);  \
   * data7 = tmp4 + z3;  data5 = tmp5 + z4;  \
   * data3 = tmp6 + z3;  data1 = tmp7 + z4;  \
   */  \
  \
  tmp47l = vec_mergeh(tmp4, tmp7);  \
  tmp47h = vec_mergel(tmp4, tmp7);  \
  \
  out7l = vec_msums(tmp47l, pw_mf060_mf089, z3l);  \
  out7h = vec_msums(tmp47h, pw_mf060_mf089, z3h);  \
  out1l = vec_msums(tmp47l, pw_mf089_f060, z4l);  \
  out1h = vec_msums(tmp47h, pw_mf089_f060, z4h);  \
  \
  out7l = vec_sra(out7l, descale_p##PASS);  \
  out7h = vec_sra(out7h, descale_p##PASS);  \
  out1l = vec_sra(out1l, descale_p##PASS);  \
  out1h = vec_sra(out1h, descale_p##PASS);  \
  \
  out7 = vec_pack(out7l, out7h);  \
  out1 = vec_pack(out1l, out1h);  \
  \
  tmp56l = vec_mergeh(tmp5, tmp6);  \
  tmp56h = vec_mergel(tmp5, tmp6);  \
  \
  out5l = vec_msums(tmp56l, pw_mf050_mf256, z4l);  \
  out5h = vec_msums(tmp56h, pw_mf050_mf256, z4h);  \
  out3l = vec_msums(tmp56l, pw_mf256_f050, z3l);  \
  out3h = vec_msums(tmp56h, pw_mf256_f050, z3h);  \
  \
  out5l = vec_sra(out5l, descale_p##PASS);  \
  out5h = vec_sra(out5h, descale_p##PASS);  \
  out3l = vec_sra(out3l, descale_p##PASS);  \
  out3h = vec_sra(out3h, descale_p##PASS);  \
  \
  out5 = vec_pack(out5l, out5h);  \
  out3 = vec_pack(out3l, out3h);  \
} while (0)
147 
/*
 * DO_FDCT_PASS1()
 *
 * First 1-D DCT pass.  Reads tmp0..tmp3 (and, through DO_FDCT_COMMON,
 * tmp4..tmp7); writes tmp10..tmp13 and out0..out7.
 *
 * Outputs 0 and 4 need no multiplication, so they are simply shifted left by
 * pass1_bits to carry the same scaling that DO_FDCT_COMMON(1) leaves on the
 * other outputs.
 *
 * Wrapped in do { } while (0) so that an invocation followed by ';' is a
 * single statement and is safe inside an unbraced if/else.
 */
#define DO_FDCT_PASS1()  \
do {  \
  /* Even part */  \
  \
  tmp10 = vec_add(tmp0, tmp3);  \
  tmp13 = vec_sub(tmp0, tmp3);  \
  tmp11 = vec_add(tmp1, tmp2);  \
  tmp12 = vec_sub(tmp1, tmp2);  \
  \
  out0  = vec_add(tmp10, tmp11);  \
  out0  = vec_sl(out0, pass1_bits);  \
  out4  = vec_sub(tmp10, tmp11);  \
  out4  = vec_sl(out4, pass1_bits);  \
  \
  /* Outputs 2 and 6 (from tmp12/tmp13) and the odd part */  \
  DO_FDCT_COMMON(1);  \
} while (0)
164 
/*
 * DO_FDCT_PASS2()
 *
 * Second 1-D DCT pass.  Reads tmp0..tmp3 (and, through DO_FDCT_COMMON,
 * tmp4..tmp7); writes tmp10..tmp13 and out0..out7.
 *
 * Outputs 0 and 4 need no multiplication; here they are rounded (by adding
 * pw_descale_p2x) and shifted right by pass1_bits, which removes the scaling
 * that pass 1 applied.  The remaining outputs are descaled inside
 * DO_FDCT_COMMON(2).
 *
 * Wrapped in do { } while (0) so that an invocation followed by ';' is a
 * single statement and is safe inside an unbraced if/else.
 */
#define DO_FDCT_PASS2()  \
do {  \
  /* Even part */  \
  \
  tmp10 = vec_add(tmp0, tmp3);  \
  tmp13 = vec_sub(tmp0, tmp3);  \
  tmp11 = vec_add(tmp1, tmp2);  \
  tmp12 = vec_sub(tmp1, tmp2);  \
  \
  out0  = vec_add(tmp10, tmp11);  \
  out0  = vec_add(out0, pw_descale_p2x);  \
  out0  = vec_sra(out0, pass1_bits);  \
  out4  = vec_sub(tmp10, tmp11);  \
  out4  = vec_add(out4, pw_descale_p2x);  \
  out4  = vec_sra(out4, pass1_bits);  \
  \
  /* Outputs 2 and 6 (from tmp12/tmp13) and the odd part */  \
  DO_FDCT_COMMON(2);  \
} while (0)
183 
184 
185 void
jsimd_fdct_islow_altivec(DCTELEM * data)186 jsimd_fdct_islow_altivec (DCTELEM *data)
187 {
188   __vector short row0, row1, row2, row3, row4, row5, row6, row7,
189     col0, col1, col2, col3, col4, col5, col6, col7,
190     tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
191     tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h,
192     z3, z4, z34l, z34h,
193     out0, out1, out2, out3, out4, out5, out6, out7;
194   __vector int z3l, z3h, z4l, z4h,
195     out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
196     out7l, out7h;
197 
198   /* Constants */
199   __vector short
200     pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
201     pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
202     pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
203     pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
204     pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
205     pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
206     pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
207     pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) },
208     pw_descale_p2x = { __8X(1 << (PASS1_BITS - 1)) };
209   __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
210   __vector int pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
211     pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
212   __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
213     descale_p2 = { __4X(DESCALE_P2) };
214 
215   /* Pass 1: process rows */
216 
217   row0 = vec_ld(0, data);
218   row1 = vec_ld(16, data);
219   row2 = vec_ld(32, data);
220   row3 = vec_ld(48, data);
221   row4 = vec_ld(64, data);
222   row5 = vec_ld(80, data);
223   row6 = vec_ld(96, data);
224   row7 = vec_ld(112, data);
225 
226   TRANSPOSE(row, col);
227 
228   tmp0 = vec_add(col0, col7);
229   tmp7 = vec_sub(col0, col7);
230   tmp1 = vec_add(col1, col6);
231   tmp6 = vec_sub(col1, col6);
232   tmp2 = vec_add(col2, col5);
233   tmp5 = vec_sub(col2, col5);
234   tmp3 = vec_add(col3, col4);
235   tmp4 = vec_sub(col3, col4);
236 
237   DO_FDCT_PASS1();
238 
239   /* Pass 2: process columns */
240 
241   TRANSPOSE(out, row);
242 
243   tmp0 = vec_add(row0, row7);
244   tmp7 = vec_sub(row0, row7);
245   tmp1 = vec_add(row1, row6);
246   tmp6 = vec_sub(row1, row6);
247   tmp2 = vec_add(row2, row5);
248   tmp5 = vec_sub(row2, row5);
249   tmp3 = vec_add(row3, row4);
250   tmp4 = vec_sub(row3, row4);
251 
252   DO_FDCT_PASS2();
253 
254   vec_st(out0, 0, data);
255   vec_st(out1, 16, data);
256   vec_st(out2, 32, data);
257   vec_st(out3, 48, data);
258   vec_st(out4, 64, data);
259   vec_st(out5, 80, data);
260   vec_st(out6, 96, data);
261   vec_st(out7, 112, data);
262 }
263