1 /* ***** BEGIN LICENSE BLOCK *****
2 *
3 * $Id: downconvert_mmx.cpp,v 1.2 2007/03/19 16:19:00 asuraparaju Exp $ $Name: Dirac_1_0_2 $
4 *
5 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
6 *
7 * The contents of this file are subject to the Mozilla Public License
8 * Version 1.1 (the "License"); you may not use this file except in compliance
9 * with the License. You may obtain a copy of the License at
10 * http://www.mozilla.org/MPL/
11 *
12 * Software distributed under the License is distributed on an "AS IS" basis,
13 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
14 * the specific language governing rights and limitations under the License.
15 *
16 * The Original Code is BBC Research and Development code.
17 *
18 * The Initial Developer of the Original Code is the British Broadcasting
19 * Corporation.
20 * Portions created by the Initial Developer are Copyright (C) 2004.
21 * All Rights Reserved.
22 *
23 * Contributor(s): Anuradha Suraparaju (Original Author)
24 *
25 * Alternatively, the contents of this file may be used under the terms of
26 * the GNU General Public License Version 2 (the "GPL"), or the GNU Lesser
27 * Public License Version 2.1 (the "LGPL"), in which case the provisions of
28 * the GPL or the LGPL are applicable instead of those above. If you wish to
29 * allow use of your version of this file only under the terms of the either
30 * the GPL or LGPL and not to allow others to use your version of this file
31 * under the MPL, indicate your decision by deleting the provisions above
32 * and replace them with the notice and other provisions required by the GPL
33 * or LGPL. If you do not delete the provisions above, a recipient may use
34 * your version of this file under the terms of any one of the MPL, the GPL
35 * or the LGPL.
36 * ***** END LICENSE BLOCK ***** */
37 
38 #include <libdirac_motionest/downconvert.h>
39 using namespace dirac;
40 
41 #if defined (HAVE_MMX)
42 #include <mmintrin.h>
43 
// Overlay of one 64-bit MMX value with its two 32-bit integer lanes.
// The filter code writes `m` with packed 32-bit results and then reads
// the individual lanes back through `i` (i[0] = low lane, i[1] = high lane).
union u_sum
{
    __m64 m;     // packed view, used with the _mm_*_pi32 intrinsics
    int   i[2];  // per-lane view, used when storing results
};
49 
50 
// Accumulates one symmetric filter tap for four adjacent 16-bit samples.
//
//   pic1, pic2 : addresses of four consecutive 16-bit samples each (the two
//                rows that share this tap); read as one 64-bit MMX load.
//   tap        : the coefficient, laid out as (0, c, 0, c) in 16-bit words.
//   zero       : an all-zero MMX value used to widen 16-bit words to 32 bits.
//   sum1, sum2 : running 32-bit accumulators; *sum1 receives samples 0 and 1,
//                *sum2 receives samples 2 and 3.
//
// For each sample k the function adds tap * (pic1[k] + pic2[k]) to the
// matching accumulator lane. The pair sum is formed with 16-bit wrap-around
// arithmetic (_mm_add_pi16), so the two samples must sum to a value that
// fits in a signed 16-bit word.
//
// This is a real function rather than a multi-statement macro so that it is
// safe inside unbraced if/else, does not rely on caller-declared scratch
// variables, and keeps const-correctness of the source rows.
static inline void mmx_add(const void* pic1, const void* pic2,
                           __m64 tap, __m64 zero,
                           __m64* sum1, __m64* sum2)
{
    // Four 16-bit pair sums: pic1[k] + pic2[k]
    const __m64 pair_sum = _mm_add_pi16(*static_cast<const __m64*>(pic1),
                                        *static_cast<const __m64*>(pic2));

    // Interleave with zero words so each 32-bit lane holds (value, 0);
    // multiplying by (c, 0) with madd then yields value*c + 0*0 per lane.
    const __m64 low_pair  = _mm_unpacklo_pi16(pair_sum, zero);
    const __m64 high_pair = _mm_unpackhi_pi16(pair_sum, zero);

    *sum1 = _mm_add_pi32(*sum1, _mm_madd_pi16(low_pair, tap));
    *sum2 = _mm_add_pi32(*sum2, _mm_madd_pi16(high_pair, tap));
}
59 
60 //General function - does some admin and calls the correct function
DoDownConvert(const PicArray & old_data,PicArray & new_data)61 void DownConverter::DoDownConvert(const PicArray& old_data, PicArray& new_data)
62 {
63     //Down-convert by a factor of two.
64     m_row_buffer= new ValueType[old_data.LengthX()];
65     //Variables that will be used by the filter calculations
66     int sum;
67     int colpos;
68 
69     // The area of the picture that will be downconverted
70     const int xlen = 2*new_data.LengthX();
71     const int ylen = 2*new_data.LengthY();
72 
73 
74     //There are three y loops to cope with the leading edge, middle
75     //and trailing edge of each column.
76     colpos=0;
77 
78     static __m64 zero = _mm_set_pi16(0, 0, 0, 0);
79     static __m64 tap0 = _mm_set_pi16 (0, StageI_I, 0, StageI_I);
80     static __m64 tap1 = _mm_set_pi16 (0, StageI_II, 0, StageI_II);
81     static __m64 tap2 = _mm_set_pi16 (0, StageI_III, 0, StageI_III);
82     static __m64 tap3 = _mm_set_pi16 (0, StageI_IV, 0, StageI_IV);
83     static __m64 tap4 = _mm_set_pi16 (0, StageI_V, 0, StageI_V);
84     static __m64 tap5 = _mm_set_pi16 (0, StageI_VI, 0, StageI_VI);
85     static __m64 round = _mm_set_pi32 ( 1<<(StageI_Shift-1), 1<<(StageI_Shift-1));
86 
87     u_sum sum1, sum2;
88     __m64 tmp, m1, m2;
89 
90     int stopX = (xlen >> 2)<<2;
91     for( int y=0; y<Stage_I_Size*2 ; y+=2 , colpos++ )
92     {
93         // We are filtering each column but doing it bit by bit.
94         // This means our main loop is in the x direction and
95         // there is a much greater chance the data we need will
96         // be in the cache.
97 
98         for( int x=0 ; x<stopX ; x+=4 )
99         {
100             // In down conversion we interpolate every pixel
101             // so there is no copying.
102             // Excuse the complicated ternary stuff but it sorts out the edge
103             sum1.m = _mm_set_pi32 (0, 0);
104             sum2.m = _mm_set_pi32 (0, 0);
105 
106             mmx_add (&old_data[y][x], &old_data[y+1][x], tap0, zero, &sum1.m, &sum2.m);
107             mmx_add(&old_data[((y-1)>=0)?(y-1):0][x] , &old_data[y+2][x], tap1, zero, &sum1.m, &sum2.m);
108             mmx_add(&old_data[((y-2)>=0)?(y-2):0][x] , &old_data[y+3][x], tap2, zero, &sum1.m, &sum2.m);
109             mmx_add(&old_data[((y-3)>=0)?(y-3):0][x] , &old_data[y+4][x], tap3, zero, &sum1.m, &sum2.m);
110             mmx_add(&old_data[((y-4)>=0)?(y-4):0][x] , &old_data[y+5][x], tap4, zero, &sum1.m, &sum2.m);
111             mmx_add(&old_data[((y-5)>=0)?(y-5):0][x] , &old_data[y+6][x], tap5, zero, &sum1.m, &sum2.m);
112 
113             sum1.m = _mm_add_pi32 (sum1.m, round);
114             sum2.m = _mm_add_pi32 (sum2.m, round);
115             sum1.m = _mm_srai_pi32 (sum1.m, StageI_Shift);
116             sum2.m = _mm_srai_pi32 (sum2.m, StageI_Shift);
117             m_row_buffer[x] = sum1.i[0];
118             m_row_buffer[x+1] = sum1.i[1];
119             m_row_buffer[x+2] = sum2.i[0];
120             m_row_buffer[x+3] = sum2.i[1];
121         }// x
122         _mm_empty();
123 
124         for( int x=stopX ; x<xlen ; x++ )
125         {
126             // In down conversion we interpolate every pixel
127             // so there is no copying.
128             // Excuse the complicated ternary stuff but it sorts out the edge
129             sum =  (old_data[y][x] + old_data[y+1][x])*StageI_I;
130             sum += (old_data[((y-1)>=0)?(y-1):0][x] + old_data[y+2][x])*StageI_II;
131             sum += (old_data[((y-2)>=0)?(y-2):0][x] + old_data[y+3][x])*StageI_III;
132             sum += (old_data[((y-3)>=0)?(y-3):0][x] + old_data[y+4][x])*StageI_IV;
133             sum += (old_data[((y-4)>=0)?(y-4):0][x] + old_data[y+5][x])*StageI_V;
134             sum += (old_data[((y-5)>=0)?(y-5):0][x] + old_data[y+6][x])*StageI_VI;
135             sum += 1<<(StageI_Shift-1);//do rounding right
136             m_row_buffer[x] = sum >> StageI_Shift;
137         }// x
138         //Speaking of which - the row loop.
139 
140         RowLoop(colpos,new_data);
141     }// y
142 
143     // This loop is like the last one but it deals with the center
144     // section of the image and so the ternary operations are dropped
145     // from the filter section.
146     for( int y=Stage_I_Size*2 ; y<ylen-Stage_I_Size*2 ; y+=2 , colpos++ )
147     {
148         for( int x=0 ; x<stopX ; x+=4 )
149         {
150             // In down conversion we interpolate every pixel
151             // so there is no copying.
152             // Excuse the complicated ternary stuff but it sorts out the edge
153             sum1.m = _mm_set_pi32 (0, 0);
154             sum2.m = _mm_set_pi32 (0, 0);
155 
156             mmx_add (&old_data[y][x], &old_data[y+1][x], tap0, zero, &sum1.m, &sum2.m);
157             mmx_add(&old_data[y-1][x] , &old_data[y+2][x], tap1, zero, &sum1.m, &sum2.m);
158             mmx_add(&old_data[y-2][x] , &old_data[y+3][x], tap2, zero, &sum1.m, &sum2.m);
159             mmx_add(&old_data[y-3][x] , &old_data[y+4][x], tap3, zero, &sum1.m, &sum2.m);
160             mmx_add(&old_data[y-4][x] , &old_data[y+5][x], tap4, zero, &sum1.m, &sum2.m);
161             mmx_add(&old_data[y-5][x] , &old_data[y+6][x], tap5, zero, &sum1.m, &sum2.m);
162 
163             sum1.m = _mm_add_pi32 (sum1.m, round);
164             sum2.m = _mm_add_pi32 (sum2.m, round);
165             sum1.m = _mm_srai_pi32 (sum1.m, StageI_Shift);
166             sum2.m = _mm_srai_pi32 (sum2.m, StageI_Shift);
167             m_row_buffer[x] = sum1.i[0];
168             m_row_buffer[x+1] = sum1.i[1];
169             m_row_buffer[x+2] = sum2.i[0];
170             m_row_buffer[x+3] = sum2.i[1];
171         }// x
172         _mm_empty();
173 
174         for( int x=stopX ; x<xlen ; x++ )
175         {
176             sum =  (old_data[y][x]   + old_data[y+1][x])*StageI_I;
177             sum += (old_data[y-1][x] + old_data[y+2][x])*StageI_II;
178             sum += (old_data[y-2][x] + old_data[y+3][x])*StageI_III;
179             sum += (old_data[y-3][x] + old_data[y+4][x])*StageI_IV;
180             sum += (old_data[y-4][x] + old_data[y+5][x])*StageI_V;
181             sum += (old_data[y-5][x] + old_data[y+6][x])*StageI_VI;
182             sum += 1<<(StageI_Shift-1);//do rounding right
183             m_row_buffer[x] = sum >> StageI_Shift;
184         }// x
185 
186         RowLoop( colpos , new_data );
187     }// y
188 
189     // Another similar loop! - this time we are dealing with
190     // the trailing edge so the ternary stuff is back in the
191     // filter calcs but in the second parameter.
192 
193     for( int y=ylen-(Stage_I_Size*2) ; y<ylen-1 ; y+=2 , colpos++ )
194     {
195         for( int x=0 ; x<stopX ; x+=4 )
196         {
197             // In down conversion we interpolate every pixel
198             // so there is no copying.
199             // Excuse the complicated ternary stuff but it sorts out the edge
200             sum1.m = _mm_set_pi32 (0, 0);
201             sum2.m = _mm_set_pi32 (0, 0);
202 
203             mmx_add (&old_data[y][x], &old_data[((y+1)<ylen)?(y+1):(ylen-1)][x], tap0, zero, &sum1.m, &sum2.m);
204             mmx_add(&old_data[y-1][x] , &old_data[((y+2)<ylen)?(y+2):(ylen-1)][x], tap1, zero, &sum1.m, &sum2.m);
205             mmx_add(&old_data[y-2][x] , &old_data[((y+3)<ylen)?(y+3):(ylen-1)][x], tap2, zero, &sum1.m, &sum2.m);
206             mmx_add(&old_data[y-3][x] , &old_data[((y+4)<ylen)?(y+4):(ylen-1)][x], tap3, zero, &sum1.m, &sum2.m);
207             mmx_add(&old_data[y-4][x] , &old_data[((y+5)<ylen)?(y+5):(ylen-1)][x], tap4, zero, &sum1.m, &sum2.m);
208             mmx_add(&old_data[y-5][x] , &old_data[((y+6)<ylen)?(y+6):(ylen-1)][x], tap5, zero, &sum1.m, &sum2.m);
209 
210             sum1.m = _mm_add_pi32 (sum1.m, round);
211             sum2.m = _mm_add_pi32 (sum2.m, round);
212             sum1.m = _mm_srai_pi32 (sum1.m, StageI_Shift);
213             sum2.m = _mm_srai_pi32 (sum2.m, StageI_Shift);
214 
215             m_row_buffer[x] = sum1.i[0];
216             m_row_buffer[x+1] = sum1.i[1];
217             m_row_buffer[x+2] = sum2.i[0];
218             m_row_buffer[x+3] = sum2.i[1];
219         }// x
220         _mm_empty();
221 
222         for( int x=stopX; x<xlen ; x++ )
223         {
224 
225             sum =  (old_data[y][x]   + old_data[((y+1)<ylen)?(y+1):(ylen-1)][x])*StageI_I;
226             sum += (old_data[y-1][x] + old_data[((y+2)<ylen)?(y+2):(ylen-1)][x])*StageI_II;
227             sum += (old_data[y-2][x] + old_data[((y+3)<ylen)?(y+3):(ylen-1)][x])*StageI_III;
228             sum += (old_data[y-3][x] + old_data[((y+4)<ylen)?(y+4):(ylen-1)][x])*StageI_IV;
229             sum += (old_data[y-4][x] + old_data[((y+5)<ylen)?(y+5):(ylen-1)][x])*StageI_V;
230             sum += (old_data[y-5][x] + old_data[((y+6)<ylen)?(y+6):(ylen-1)][x])*StageI_VI;
231 
232             // Do rounding right
233             sum += 1<<(StageI_Shift-1);
234             m_row_buffer[x] = sum >> StageI_Shift;
235 
236         }// x
237 
238         RowLoop( colpos , new_data );
239 
240     }//  y
241 
242     // Tidy up the data
243     delete[] m_row_buffer;
244 
245 }
246 #endif
247