1 /*
2  * AltiVec optimizations for libjpeg-turbo
3  *
4  * Copyright (C) 2014, D. R. Commander.
5  * All rights reserved.
6  * This software is provided 'as-is', without any express or implied
7  * warranty.  In no event will the authors be held liable for any damages
8  * arising from the use of this software.
9  *
10  * Permission is granted to anyone to use this software for any purpose,
11  * including commercial applications, and to alter it and redistribute it
12  * freely, subject to the following restrictions:
13  *
14  * 1. The origin of this software must not be misrepresented; you must not
15  *    claim that you wrote the original software. If you use this software
16  *    in a product, an acknowledgment in the product documentation would be
17  *    appreciated but is not required.
18  * 2. Altered source versions must be plainly marked as such, and must not be
19  *    misrepresented as being the original software.
20  * 3. This notice may not be removed or altered from any source distribution.
21  */
22 
23 /* FAST INTEGER FORWARD DCT
24  *
25  * This is similar to the SSE2 implementation, except that we left-shift the
26  * constants by 1 less bit (the -1 in CONST_SHIFT.)  This is because
27  * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
28  *   the elements in arg3 + the most significant 17 bits of
29  *     (the elements in arg1 * the elements in arg2).
30  */
31 
32 #include "jsimd_altivec.h"
33 
34 
/*
 * Fixed-point parameters.
 *
 * CONST_BITS is the number of fractional bits in the F_* multipliers below.
 * PRE_MULTIPLY_SCALE_BITS is the left shift applied to each multiplicand
 * before it is handed to vec_madds().
 * CONST_SHIFT is the left shift applied to the F_* multipliers when the
 * constant vectors are built; the trailing -1 is explained in the file
 * comment above (vec_madds() keeps the most significant 17 bits of the
 * product).
 */
#define CONST_BITS 8
#define PRE_MULTIPLY_SCALE_BITS 2
#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)

/* DCT multipliers, scaled by 2^CONST_BITS and rounded to an integer. */
#define F_0_382 98   /* FIX(0.382683433) */
#define F_0_541 139  /* FIX(0.541196100) */
#define F_0_707 181  /* FIX(0.707106781) */
#define F_1_306 334  /* FIX(1.306562965) */
43 
44 
/*
 * One 1-D pass of the fast integer forward DCT, applied element-wise to the
 * variables of the enclosing scope.
 *
 * Inputs (read):   tmp0..tmp7, pw_zero, pw_0382, pw_0541, pw_0707, pw_1306,
 *                  pre_multiply_scale_bits
 * Outputs (set):   out0..out7
 * Scratch (set):   tmp10..tmp13, z1..z5, z11, z13
 *
 * Every multiplicand is shifted left by pre_multiply_scale_bits before it is
 * passed to vec_madds() together with one of the pw_* multipliers.
 *
 * The body is wrapped in do { ... } while (0) so that "DO_FDCT();" behaves as
 * a single statement, including as the body of an if that has an else.
 */
#define DO_FDCT()  \
do {  \
  /* Even part */  \
  \
  tmp10 = vec_add(tmp0, tmp3);  \
  tmp13 = vec_sub(tmp0, tmp3);  \
  tmp11 = vec_add(tmp1, tmp2);  \
  tmp12 = vec_sub(tmp1, tmp2);  \
  \
  out0  = vec_add(tmp10, tmp11);  \
  out4  = vec_sub(tmp10, tmp11);  \
  \
  /* z1 = (tmp12 + tmp13) scaled by the 0.707 multiplier */  \
  z1 = vec_add(tmp12, tmp13);  \
  z1 = vec_sl(z1, pre_multiply_scale_bits);  \
  z1 = vec_madds(z1, pw_0707, pw_zero);  \
  \
  out2 = vec_add(tmp13, z1);  \
  out6 = vec_sub(tmp13, z1);  \
  \
  /* Odd part */  \
  \
  tmp10 = vec_add(tmp4, tmp5);  \
  tmp11 = vec_add(tmp5, tmp6);  \
  tmp12 = vec_add(tmp6, tmp7);  \
  \
  tmp10 = vec_sl(tmp10, pre_multiply_scale_bits);  \
  tmp12 = vec_sl(tmp12, pre_multiply_scale_bits);  \
  z5 = vec_sub(tmp10, tmp12);  \
  z5 = vec_madds(z5, pw_0382, pw_zero);  \
  \
  /* z5 is passed as the addend, so it is folded into z2 and z4 here */  \
  z2 = vec_madds(tmp10, pw_0541, z5);  \
  z4 = vec_madds(tmp12, pw_1306, z5);  \
  \
  tmp11 = vec_sl(tmp11, pre_multiply_scale_bits);  \
  z3 = vec_madds(tmp11, pw_0707, pw_zero);  \
  \
  z11 = vec_add(tmp7, z3);  \
  z13 = vec_sub(tmp7, z3);  \
  \
  out5 = vec_add(z13, z2);  \
  out3 = vec_sub(z13, z2);  \
  out1 = vec_add(z11, z4);  \
  out7 = vec_sub(z11, z4);  \
} while (0)
89 
90 
91 void
jsimd_fdct_ifast_altivec(DCTELEM * data)92 jsimd_fdct_ifast_altivec (DCTELEM *data)
93 {
94   __vector short row0, row1, row2, row3, row4, row5, row6, row7,
95     col0, col1, col2, col3, col4, col5, col6, col7,
96     tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
97     z1, z2, z3, z4, z5, z11, z13,
98     out0, out1, out2, out3, out4, out5, out6, out7;
99 
100   /* Constants */
101   __vector short pw_zero = { __8X(0) },
102     pw_0382 = { __8X(F_0_382 << CONST_SHIFT) },
103     pw_0541 = { __8X(F_0_541 << CONST_SHIFT) },
104     pw_0707 = { __8X(F_0_707 << CONST_SHIFT) },
105     pw_1306 = { __8X(F_1_306 << CONST_SHIFT) };
106   __vector unsigned short
107     pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) };
108 
109   /* Pass 1: process rows */
110 
111   row0 = vec_ld(0, data);
112   row1 = vec_ld(16, data);
113   row2 = vec_ld(32, data);
114   row3 = vec_ld(48, data);
115   row4 = vec_ld(64, data);
116   row5 = vec_ld(80, data);
117   row6 = vec_ld(96, data);
118   row7 = vec_ld(112, data);
119 
120   TRANSPOSE(row, col);
121 
122   tmp0 = vec_add(col0, col7);
123   tmp7 = vec_sub(col0, col7);
124   tmp1 = vec_add(col1, col6);
125   tmp6 = vec_sub(col1, col6);
126   tmp2 = vec_add(col2, col5);
127   tmp5 = vec_sub(col2, col5);
128   tmp3 = vec_add(col3, col4);
129   tmp4 = vec_sub(col3, col4);
130 
131   DO_FDCT();
132 
133   /* Pass 2: process columns */
134 
135   TRANSPOSE(out, row);
136 
137   tmp0 = vec_add(row0, row7);
138   tmp7 = vec_sub(row0, row7);
139   tmp1 = vec_add(row1, row6);
140   tmp6 = vec_sub(row1, row6);
141   tmp2 = vec_add(row2, row5);
142   tmp5 = vec_sub(row2, row5);
143   tmp3 = vec_add(row3, row4);
144   tmp4 = vec_sub(row3, row4);
145 
146   DO_FDCT();
147 
148   vec_st(out0, 0, data);
149   vec_st(out1, 16, data);
150   vec_st(out2, 32, data);
151   vec_st(out3, 48, data);
152   vec_st(out4, 64, data);
153   vec_st(out5, 80, data);
154   vec_st(out6, 96, data);
155   vec_st(out7, 112, data);
156 }
157