1 /* ***** BEGIN LICENSE BLOCK *****
2 *
3 * $Id: downconvert_mmx.cpp,v 1.2 2007/03/19 16:19:00 asuraparaju Exp $ $Name: Dirac_1_0_2 $
4 *
5 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
6 *
7 * The contents of this file are subject to the Mozilla Public License
8 * Version 1.1 (the "License"); you may not use this file except in compliance
9 * with the License. You may obtain a copy of the License at
10 * http://www.mozilla.org/MPL/
11 *
12 * Software distributed under the License is distributed on an "AS IS" basis,
13 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
14 * the specific language governing rights and limitations under the License.
15 *
16 * The Original Code is BBC Research and Development code.
17 *
18 * The Initial Developer of the Original Code is the British Broadcasting
19 * Corporation.
20 * Portions created by the Initial Developer are Copyright (C) 2004.
21 * All Rights Reserved.
22 *
23 * Contributor(s): Anuradha Suraparaju (Original Author)
24 *
25 * Alternatively, the contents of this file may be used under the terms of
26 * the GNU General Public License Version 2 (the "GPL"), or the GNU Lesser
27 * Public License Version 2.1 (the "LGPL"), in which case the provisions of
28 * the GPL or the LGPL are applicable instead of those above. If you wish to
29 * allow use of your version of this file only under the terms of the either
30 * the GPL or LGPL and not to allow others to use your version of this file
31 * under the MPL, indicate your decision by deleting the provisions above
32 * and replace them with the notice and other provisions required by the GPL
33 * or LGPL. If you do not delete the provisions above, a recipient may use
34 * your version of this file under the terms of any one of the MPL, the GPL
35 * or the LGPL.
36 * ***** END LICENSE BLOCK ***** */
37
38 #include <libdirac_motionest/downconvert.h>
39 using namespace dirac;
40
41 #if defined (HAVE_MMX)
42 #include <mmintrin.h>
43
// Overlay that lets the two 32-bit lanes of an MMX register be read back
// as ordinary ints: results are written through `m` and then picked apart
// through `i`. DoDownConvert stores i[0] for the lower-indexed sample of a
// pair and i[1] for the sample after it.
union u_sum
{
    __m64 m;     // the 64-bit MMX value
    int   i[2];  // the same 64 bits viewed as two 32-bit integers
};
49
50
// mmx_add(pic1, pic2, tap, zero, sum1, sum2)
//
// Accumulates one symmetric filter tap for four adjacent 16-bit samples.
//
//   pic1, pic2 : pointers to the first of four consecutive 16-bit samples
//                in the two rows that share this tap (only read).
//   tap        : __m64 holding the coefficient in 16-bit lanes 0 and 2 and
//                zero in lanes 1 and 3, i.e. _mm_set_pi16(0, c, 0, c).
//   zero       : an all-zero __m64.
//   sum1, sum2 : pointers to __m64 accumulators of two 32-bit lanes each;
//                *sum1 receives the products for samples 0 and 1,
//                *sum2 those for samples 2 and 3.
//
// The two rows are added lane by lane, each sum is interleaved with a zero
// word, and _mm_madd_pi16 against `tap` then yields (a+b)*c as a 32-bit
// value per sample.
//
// The caller must have `__m64 tmp, m1, m2;` in scope: they are used as
// scratch storage. The caller is also responsible for calling _mm_empty()
// before any floating-point code runs.
//
// NOTE(review): the row sum is formed with _mm_add_pi16, which wraps at 16
// bits, whereas the scalar fallback in DoDownConvert adds in int. The two
// paths agree only while a+b fits in a signed 16-bit value - confirm the
// sample range guarantees that.
//
// The body is wrapped in do { } while (0) so that an invocation followed by
// ';' is exactly one statement, even under an unbraced if/else.
#define mmx_add(pic1,pic2,tap,zero,sum1,sum2) \
    do { \
        tmp = _mm_add_pi16 (*(const __m64 *)(pic1), *(const __m64 *)(pic2)); \
        m1 = _mm_unpacklo_pi16 (tmp, (zero)); \
        m2 = _mm_unpackhi_pi16 (tmp, (zero)); \
        m1 = _mm_madd_pi16 (m1, (tap)); \
        m2 = _mm_madd_pi16 (m2, (tap)); \
        *(sum1) = _mm_add_pi32 (*(sum1), m1); \
        *(sum2) = _mm_add_pi32 (*(sum2), m2); \
    } while (0)

60 //General function - does some admin and calls the correct function
DoDownConvert(const PicArray & old_data,PicArray & new_data)61 void DownConverter::DoDownConvert(const PicArray& old_data, PicArray& new_data)
62 {
63 //Down-convert by a factor of two.
64 m_row_buffer= new ValueType[old_data.LengthX()];
65 //Variables that will be used by the filter calculations
66 int sum;
67 int colpos;
68
69 // The area of the picture that will be downconverted
70 const int xlen = 2*new_data.LengthX();
71 const int ylen = 2*new_data.LengthY();
72
73
74 //There are three y loops to cope with the leading edge, middle
75 //and trailing edge of each column.
76 colpos=0;
77
78 static __m64 zero = _mm_set_pi16(0, 0, 0, 0);
79 static __m64 tap0 = _mm_set_pi16 (0, StageI_I, 0, StageI_I);
80 static __m64 tap1 = _mm_set_pi16 (0, StageI_II, 0, StageI_II);
81 static __m64 tap2 = _mm_set_pi16 (0, StageI_III, 0, StageI_III);
82 static __m64 tap3 = _mm_set_pi16 (0, StageI_IV, 0, StageI_IV);
83 static __m64 tap4 = _mm_set_pi16 (0, StageI_V, 0, StageI_V);
84 static __m64 tap5 = _mm_set_pi16 (0, StageI_VI, 0, StageI_VI);
85 static __m64 round = _mm_set_pi32 ( 1<<(StageI_Shift-1), 1<<(StageI_Shift-1));
86
87 u_sum sum1, sum2;
88 __m64 tmp, m1, m2;
89
90 int stopX = (xlen >> 2)<<2;
91 for( int y=0; y<Stage_I_Size*2 ; y+=2 , colpos++ )
92 {
93 // We are filtering each column but doing it bit by bit.
94 // This means our main loop is in the x direction and
95 // there is a much greater chance the data we need will
96 // be in the cache.
97
98 for( int x=0 ; x<stopX ; x+=4 )
99 {
100 // In down conversion we interpolate every pixel
101 // so there is no copying.
102 // Excuse the complicated ternary stuff but it sorts out the edge
103 sum1.m = _mm_set_pi32 (0, 0);
104 sum2.m = _mm_set_pi32 (0, 0);
105
106 mmx_add (&old_data[y][x], &old_data[y+1][x], tap0, zero, &sum1.m, &sum2.m);
107 mmx_add(&old_data[((y-1)>=0)?(y-1):0][x] , &old_data[y+2][x], tap1, zero, &sum1.m, &sum2.m);
108 mmx_add(&old_data[((y-2)>=0)?(y-2):0][x] , &old_data[y+3][x], tap2, zero, &sum1.m, &sum2.m);
109 mmx_add(&old_data[((y-3)>=0)?(y-3):0][x] , &old_data[y+4][x], tap3, zero, &sum1.m, &sum2.m);
110 mmx_add(&old_data[((y-4)>=0)?(y-4):0][x] , &old_data[y+5][x], tap4, zero, &sum1.m, &sum2.m);
111 mmx_add(&old_data[((y-5)>=0)?(y-5):0][x] , &old_data[y+6][x], tap5, zero, &sum1.m, &sum2.m);
112
113 sum1.m = _mm_add_pi32 (sum1.m, round);
114 sum2.m = _mm_add_pi32 (sum2.m, round);
115 sum1.m = _mm_srai_pi32 (sum1.m, StageI_Shift);
116 sum2.m = _mm_srai_pi32 (sum2.m, StageI_Shift);
117 m_row_buffer[x] = sum1.i[0];
118 m_row_buffer[x+1] = sum1.i[1];
119 m_row_buffer[x+2] = sum2.i[0];
120 m_row_buffer[x+3] = sum2.i[1];
121 }// x
122 _mm_empty();
123
124 for( int x=stopX ; x<xlen ; x++ )
125 {
126 // In down conversion we interpolate every pixel
127 // so there is no copying.
128 // Excuse the complicated ternary stuff but it sorts out the edge
129 sum = (old_data[y][x] + old_data[y+1][x])*StageI_I;
130 sum += (old_data[((y-1)>=0)?(y-1):0][x] + old_data[y+2][x])*StageI_II;
131 sum += (old_data[((y-2)>=0)?(y-2):0][x] + old_data[y+3][x])*StageI_III;
132 sum += (old_data[((y-3)>=0)?(y-3):0][x] + old_data[y+4][x])*StageI_IV;
133 sum += (old_data[((y-4)>=0)?(y-4):0][x] + old_data[y+5][x])*StageI_V;
134 sum += (old_data[((y-5)>=0)?(y-5):0][x] + old_data[y+6][x])*StageI_VI;
135 sum += 1<<(StageI_Shift-1);//do rounding right
136 m_row_buffer[x] = sum >> StageI_Shift;
137 }// x
138 //Speaking of which - the row loop.
139
140 RowLoop(colpos,new_data);
141 }// y
142
143 // This loop is like the last one but it deals with the center
144 // section of the image and so the ternary operations are dropped
145 // from the filter section.
146 for( int y=Stage_I_Size*2 ; y<ylen-Stage_I_Size*2 ; y+=2 , colpos++ )
147 {
148 for( int x=0 ; x<stopX ; x+=4 )
149 {
150 // In down conversion we interpolate every pixel
151 // so there is no copying.
152 // Excuse the complicated ternary stuff but it sorts out the edge
153 sum1.m = _mm_set_pi32 (0, 0);
154 sum2.m = _mm_set_pi32 (0, 0);
155
156 mmx_add (&old_data[y][x], &old_data[y+1][x], tap0, zero, &sum1.m, &sum2.m);
157 mmx_add(&old_data[y-1][x] , &old_data[y+2][x], tap1, zero, &sum1.m, &sum2.m);
158 mmx_add(&old_data[y-2][x] , &old_data[y+3][x], tap2, zero, &sum1.m, &sum2.m);
159 mmx_add(&old_data[y-3][x] , &old_data[y+4][x], tap3, zero, &sum1.m, &sum2.m);
160 mmx_add(&old_data[y-4][x] , &old_data[y+5][x], tap4, zero, &sum1.m, &sum2.m);
161 mmx_add(&old_data[y-5][x] , &old_data[y+6][x], tap5, zero, &sum1.m, &sum2.m);
162
163 sum1.m = _mm_add_pi32 (sum1.m, round);
164 sum2.m = _mm_add_pi32 (sum2.m, round);
165 sum1.m = _mm_srai_pi32 (sum1.m, StageI_Shift);
166 sum2.m = _mm_srai_pi32 (sum2.m, StageI_Shift);
167 m_row_buffer[x] = sum1.i[0];
168 m_row_buffer[x+1] = sum1.i[1];
169 m_row_buffer[x+2] = sum2.i[0];
170 m_row_buffer[x+3] = sum2.i[1];
171 }// x
172 _mm_empty();
173
174 for( int x=stopX ; x<xlen ; x++ )
175 {
176 sum = (old_data[y][x] + old_data[y+1][x])*StageI_I;
177 sum += (old_data[y-1][x] + old_data[y+2][x])*StageI_II;
178 sum += (old_data[y-2][x] + old_data[y+3][x])*StageI_III;
179 sum += (old_data[y-3][x] + old_data[y+4][x])*StageI_IV;
180 sum += (old_data[y-4][x] + old_data[y+5][x])*StageI_V;
181 sum += (old_data[y-5][x] + old_data[y+6][x])*StageI_VI;
182 sum += 1<<(StageI_Shift-1);//do rounding right
183 m_row_buffer[x] = sum >> StageI_Shift;
184 }// x
185
186 RowLoop( colpos , new_data );
187 }// y
188
189 // Another similar loop! - this time we are dealing with
190 // the trailing edge so the ternary stuff is back in the
191 // filter calcs but in the second parameter.
192
193 for( int y=ylen-(Stage_I_Size*2) ; y<ylen-1 ; y+=2 , colpos++ )
194 {
195 for( int x=0 ; x<stopX ; x+=4 )
196 {
197 // In down conversion we interpolate every pixel
198 // so there is no copying.
199 // Excuse the complicated ternary stuff but it sorts out the edge
200 sum1.m = _mm_set_pi32 (0, 0);
201 sum2.m = _mm_set_pi32 (0, 0);
202
203 mmx_add (&old_data[y][x], &old_data[((y+1)<ylen)?(y+1):(ylen-1)][x], tap0, zero, &sum1.m, &sum2.m);
204 mmx_add(&old_data[y-1][x] , &old_data[((y+2)<ylen)?(y+2):(ylen-1)][x], tap1, zero, &sum1.m, &sum2.m);
205 mmx_add(&old_data[y-2][x] , &old_data[((y+3)<ylen)?(y+3):(ylen-1)][x], tap2, zero, &sum1.m, &sum2.m);
206 mmx_add(&old_data[y-3][x] , &old_data[((y+4)<ylen)?(y+4):(ylen-1)][x], tap3, zero, &sum1.m, &sum2.m);
207 mmx_add(&old_data[y-4][x] , &old_data[((y+5)<ylen)?(y+5):(ylen-1)][x], tap4, zero, &sum1.m, &sum2.m);
208 mmx_add(&old_data[y-5][x] , &old_data[((y+6)<ylen)?(y+6):(ylen-1)][x], tap5, zero, &sum1.m, &sum2.m);
209
210 sum1.m = _mm_add_pi32 (sum1.m, round);
211 sum2.m = _mm_add_pi32 (sum2.m, round);
212 sum1.m = _mm_srai_pi32 (sum1.m, StageI_Shift);
213 sum2.m = _mm_srai_pi32 (sum2.m, StageI_Shift);
214
215 m_row_buffer[x] = sum1.i[0];
216 m_row_buffer[x+1] = sum1.i[1];
217 m_row_buffer[x+2] = sum2.i[0];
218 m_row_buffer[x+3] = sum2.i[1];
219 }// x
220 _mm_empty();
221
222 for( int x=stopX; x<xlen ; x++ )
223 {
224
225 sum = (old_data[y][x] + old_data[((y+1)<ylen)?(y+1):(ylen-1)][x])*StageI_I;
226 sum += (old_data[y-1][x] + old_data[((y+2)<ylen)?(y+2):(ylen-1)][x])*StageI_II;
227 sum += (old_data[y-2][x] + old_data[((y+3)<ylen)?(y+3):(ylen-1)][x])*StageI_III;
228 sum += (old_data[y-3][x] + old_data[((y+4)<ylen)?(y+4):(ylen-1)][x])*StageI_IV;
229 sum += (old_data[y-4][x] + old_data[((y+5)<ylen)?(y+5):(ylen-1)][x])*StageI_V;
230 sum += (old_data[y-5][x] + old_data[((y+6)<ylen)?(y+6):(ylen-1)][x])*StageI_VI;
231
232 // Do rounding right
233 sum += 1<<(StageI_Shift-1);
234 m_row_buffer[x] = sum >> StageI_Shift;
235
236 }// x
237
238 RowLoop( colpos , new_data );
239
240 }// y
241
242 // Tidy up the data
243 delete[] m_row_buffer;
244
245 }
246 #endif
247