1 /********************************************************************
2  *                                                                  *
3  * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
4  * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
5  * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
7  *                                                                  *
8  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
9  * by the Xiph.Org Foundation http://www.xiph.org/                  *
10  *                                                                  *
11  ********************************************************************
12 
13   function:
14   last mod: $Id: dsp_mmx.c 14579 2008-03-12 06:42:40Z xiphmont $
15 
16  ********************************************************************/
17 #include <stddef.h>
18 #include "x86enc.h"
19 
20 #if defined(OC_X86_ASM)
21 
/*Computes the sum of absolute differences (SAD) between two 8x8 blocks of
   bytes with the MMXEXT psadbw instruction.
  _src:     The first row of the first block.
  _ref:     The first row of the second block.
  _ystride: The offset in bytes between consecutive rows; the same stride is
             used for both blocks.
  Return: The sum of |_src[i]-_ref[i]| over all 64 pixel positions.
  All eight MMX registers are overwritten, and no emms is executed here.*/
unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  ptrdiff_t ystride3;
  ptrdiff_t ret;
  __asm__ __volatile__(
    /*Load the first 4 rows of each block.*/
    "movq (%[src]),%%mm0\n\t"
    "movq (%[ref]),%%mm1\n\t"
    "movq (%[src],%[ystride]),%%mm2\n\t"
    "movq (%[ref],%[ystride]),%%mm3\n\t"
    "lea (%[ystride],%[ystride],2),%[ystride3]\n\t"
    "movq (%[src],%[ystride],2),%%mm4\n\t"
    "movq (%[ref],%[ystride],2),%%mm5\n\t"
    "movq (%[src],%[ystride3]),%%mm6\n\t"
    "movq (%[ref],%[ystride3]),%%mm7\n\t"
    /*Compute their SADs and add them in %%mm0*/
    "psadbw %%mm1,%%mm0\n\t"
    "psadbw %%mm3,%%mm2\n\t"
    "lea (%[src],%[ystride],4),%[src]\n\t"
    "paddw %%mm2,%%mm0\n\t"
    "lea (%[ref],%[ystride],4),%[ref]\n\t"
    /*Load the next 3 rows as registers become available.*/
    "movq (%[src]),%%mm2\n\t"
    "movq (%[ref]),%%mm3\n\t"
    "psadbw %%mm5,%%mm4\n\t"
    "psadbw %%mm7,%%mm6\n\t"
    "paddw %%mm4,%%mm0\n\t"
    "movq (%[ref],%[ystride]),%%mm5\n\t"
    "movq (%[src],%[ystride]),%%mm4\n\t"
    "paddw %%mm6,%%mm0\n\t"
    "movq (%[ref],%[ystride],2),%%mm7\n\t"
    "movq (%[src],%[ystride],2),%%mm6\n\t"
    /*Start adding their SADs to %%mm0*/
    "psadbw %%mm3,%%mm2\n\t"
    "psadbw %%mm5,%%mm4\n\t"
    "paddw %%mm2,%%mm0\n\t"
    "psadbw %%mm7,%%mm6\n\t"
    /*Load last row as registers become available.*/
    "movq (%[src],%[ystride3]),%%mm2\n\t"
    "movq (%[ref],%[ystride3]),%%mm3\n\t"
    /*And finish adding up their SADs.*/
    "paddw %%mm4,%%mm0\n\t"
    "psadbw %%mm3,%%mm2\n\t"
    "paddw %%mm6,%%mm0\n\t"
    "paddw %%mm2,%%mm0\n\t"
    /*Copy the accumulated total out of %%mm0.*/
    "movd %%mm0,%[ret]\n\t"
    /*%[ret] is only written by the last instruction, so it needs no
       early-clobber; %[ystride3] is written while inputs are still live, so
       it does.
      The '%' commutative modifier is not used on %[src]: GCC only permits it
       on read-only operands, and both pointers are read-write here.*/
    :[ret]"=a"(ret),[src]"+r"(_src),[ref]"+r"(_ref),[ystride3]"=&r"(ystride3)
    :[ystride]"r"((ptrdiff_t)_ystride)
    /*The pixels are read through the pointer registers rather than through
       memory operands, so the compiler must be told that memory is accessed.*/
    :"memory"
  );
  return (unsigned)ret;
}
73 
/*Thresholded variant of the 8x8 SAD.
  This implementation never terminates early: _thresh is ignored and the
   complete result of oc_enc_frag_sad_mmxext() is always returned.*/
unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
 const unsigned char *_ref,int _ystride,unsigned _thresh){
  unsigned sad;
  /*Early termination is for suckers.*/
  (void)_thresh;
  sad=oc_enc_frag_sad_mmxext(_src,_ref,_ystride);
  return sad;
}
79 
/*Adds to %[ret] the SAD of two rows of %[src] against the rounded-down
   average of the corresponding rows of %[ref1] and %[ref2].
  Assumes the first two rows of %[ref1] and %[ref2] are in %%mm0...%%mm3, the
   first two rows of %[src] are in %%mm4,%%mm5, and {1}x8 is in %%mm7.
  We pre-load the next two rows of data as registers become available, so on
   exit %[src], %[ref1] and %[ref2] have each been advanced by two rows and
   %%mm0...%%mm5 hold the data for the next pair of rows.
  %%mm6 is used as scratch.*/
#define OC_SAD2_LOOP \
 "#OC_SAD2_LOOP\n\t" \
 /*We want to compute (%%mm0+%%mm1>>1) on unsigned bytes without overflow, but \
    pavgb computes (%%mm0+%%mm1+1>>1). \
   The latter is exactly 1 too large when the low bit of two corresponding \
    bytes is only set in one of them. \
   Therefore we pxor the operands, pand to mask out the low bits, and psubb to \
    correct the output of pavgb.*/ \
 "movq %%mm0,%%mm6\n\t" \
 "lea (%[ref1],%[ystride],2),%[ref1]\n\t" \
 "pxor %%mm1,%%mm0\n\t" \
 "pavgb %%mm1,%%mm6\n\t" \
 "lea (%[ref2],%[ystride],2),%[ref2]\n\t" \
 "movq %%mm2,%%mm1\n\t" \
 "pand %%mm7,%%mm0\n\t" \
 "pavgb %%mm3,%%mm2\n\t" \
 "pxor %%mm3,%%mm1\n\t" \
 "movq (%[ref2],%[ystride]),%%mm3\n\t" \
 "psubb %%mm0,%%mm6\n\t" \
 "movq (%[ref1]),%%mm0\n\t" \
 "pand %%mm7,%%mm1\n\t" \
 /*%%mm4=SAD of the first source row against the corrected average.*/ \
 "psadbw %%mm6,%%mm4\n\t" \
 /*Fetch the running total so this pair of rows can be added to it.*/ \
 "movd %[ret],%%mm6\n\t" \
 "psubb %%mm1,%%mm2\n\t" \
 "movq (%[ref2]),%%mm1\n\t" \
 "lea (%[src],%[ystride],2),%[src]\n\t" \
 /*%%mm5=SAD of the second source row against the corrected average.*/ \
 "psadbw %%mm2,%%mm5\n\t" \
 "movq (%[ref1],%[ystride]),%%mm2\n\t" \
 "paddw %%mm4,%%mm5\n\t" \
 "movq (%[src]),%%mm4\n\t" \
 "paddw %%mm5,%%mm6\n\t" \
 "movq (%[src],%[ystride]),%%mm5\n\t" \
 /*Write the updated running total back.*/ \
 "movd %%mm6,%[ret]\n\t" \
116 
/*Same as above, but does not pre-load the next two rows.
  It adds to %[ret] the SAD of the two rows of %[src] held in %%mm4,%%mm5
   against the rounded-down averages of %%mm0,%%mm1 and of %%mm2,%%mm3, with
   {1}x8 expected in %%mm7.
  No memory is read and no pointer is advanced; %%mm6 is used as scratch.*/
#define OC_SAD2_TAIL \
 "#OC_SAD2_TAIL\n\t" \
 "movq %%mm0,%%mm6\n\t" \
 "pavgb %%mm1,%%mm0\n\t" \
 "pxor %%mm1,%%mm6\n\t" \
 "movq %%mm2,%%mm1\n\t" \
 "pand %%mm7,%%mm6\n\t" \
 "pavgb %%mm3,%%mm2\n\t" \
 "pxor %%mm3,%%mm1\n\t" \
 /*Remove the rounding bias of pavgb from the first average.*/ \
 "psubb %%mm6,%%mm0\n\t" \
 "pand %%mm7,%%mm1\n\t" \
 "psadbw %%mm0,%%mm4\n\t" \
 /*Remove the rounding bias of pavgb from the second average.*/ \
 "psubb %%mm1,%%mm2\n\t" \
 "movd %[ret],%%mm6\n\t" \
 "psadbw %%mm2,%%mm5\n\t" \
 "paddw %%mm4,%%mm5\n\t" \
 "paddw %%mm5,%%mm6\n\t" \
 "movd %%mm6,%[ret]\n\t" \
136 
oc_enc_frag_sad2_thresh_mmxext(const unsigned char * _src,const unsigned char * _ref1,const unsigned char * _ref2,int _ystride,unsigned _thresh)137 unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
138  const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
139  unsigned _thresh){
140   ptrdiff_t ret;
141   __asm__ __volatile__(
142     "movq (%[ref1]),%%mm0\n\t"
143     "movq (%[ref2]),%%mm1\n\t"
144     "movq (%[ref1],%[ystride]),%%mm2\n\t"
145     "movq (%[ref2],%[ystride]),%%mm3\n\t"
146     "xor %[ret],%[ret]\n\t"
147     "movq (%[src]),%%mm4\n\t"
148     "pxor %%mm7,%%mm7\n\t"
149     "pcmpeqb %%mm6,%%mm6\n\t"
150     "movq (%[src],%[ystride]),%%mm5\n\t"
151     "psubb %%mm6,%%mm7\n\t"
152     OC_SAD2_LOOP
153     OC_SAD2_LOOP
154     OC_SAD2_LOOP
155     OC_SAD2_TAIL
156     :[ret]"=&a"(ret),[src]"+r"(_src),[ref1]"+%r"(_ref1),[ref2]"+r"(_ref2)
157     :[ystride]"r"((ptrdiff_t)_ystride)
158   );
159   return (unsigned)ret;
160 }
161 
/*Load an 8x4 array of pixel values from %[src] and %[ref] and compute their
   16-bit difference in %%mm0...%%mm7.
  Four bytes are read at byte offset _off (a string literal) from each of
   eight consecutive rows, and register %%mmN receives the differences
   src-ref for row N.
  Each difference is formed by interleaving the source bytes with the
   reference bytes, interleaving the reference bytes with themselves, and
   subtracting the two as words, which cancels the high bytes.
  %[src] and %[ref] are stepped down the block two rows at a time and then
   moved back by eight rows using the negated strides, so the pointers and
   both strides hold their entry values again on exit.
  The neg instructions modify the condition codes.
  The eight bytes at _off*2(%[buf]) are used to park row 0 while %%mm0 is
   reused as a temporary.*/
#define OC_LOAD_SUB_8x4(_off) \
 "#OC_LOAD_SUB_8x4\n\t" \
 "movd "_off"(%[src]),%%mm0\n\t" \
 "movd "_off"(%[ref]),%%mm4\n\t" \
 "movd "_off"(%[src],%[src_ystride]),%%mm1\n\t" \
 "lea (%[src],%[src_ystride],2),%[src]\n\t" \
 "movd "_off"(%[ref],%[ref_ystride]),%%mm5\n\t" \
 "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
 "movd "_off"(%[src]),%%mm2\n\t" \
 "movd "_off"(%[ref]),%%mm7\n\t" \
 "movd "_off"(%[src],%[src_ystride]),%%mm3\n\t" \
 "movd "_off"(%[ref],%[ref_ystride]),%%mm6\n\t" \
 "punpcklbw %%mm4,%%mm0\n\t" \
 "lea (%[src],%[src_ystride],2),%[src]\n\t" \
 "punpcklbw %%mm4,%%mm4\n\t" \
 "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
 "psubw %%mm4,%%mm0\n\t" \
 "movd "_off"(%[src]),%%mm4\n\t" \
 /*Park the row 0 difference so %%mm0 can hold reference rows.*/ \
 "movq %%mm0,"_off"*2(%[buf])\n\t" \
 "movd "_off"(%[ref]),%%mm0\n\t" \
 "punpcklbw %%mm5,%%mm1\n\t" \
 "punpcklbw %%mm5,%%mm5\n\t" \
 "psubw %%mm5,%%mm1\n\t" \
 "movd "_off"(%[src],%[src_ystride]),%%mm5\n\t" \
 "punpcklbw %%mm7,%%mm2\n\t" \
 "punpcklbw %%mm7,%%mm7\n\t" \
 "psubw %%mm7,%%mm2\n\t" \
 "movd "_off"(%[ref],%[ref_ystride]),%%mm7\n\t" \
 "punpcklbw %%mm6,%%mm3\n\t" \
 "lea (%[src],%[src_ystride],2),%[src]\n\t" \
 "punpcklbw %%mm6,%%mm6\n\t" \
 "psubw %%mm6,%%mm3\n\t" \
 "movd "_off"(%[src]),%%mm6\n\t" \
 "punpcklbw %%mm0,%%mm4\n\t" \
 "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
 "punpcklbw %%mm0,%%mm0\n\t" \
 "lea (%[src],%[src_ystride],2),%[src]\n\t" \
 "psubw %%mm0,%%mm4\n\t" \
 "movd "_off"(%[ref]),%%mm0\n\t" \
 "punpcklbw %%mm7,%%mm5\n\t" \
 /*%[src] now points at row 8; negate the stride to reach row 7 and then \
    to rewind to row 0.*/ \
 "neg %[src_ystride]\n\t" \
 "punpcklbw %%mm7,%%mm7\n\t" \
 "psubw %%mm7,%%mm5\n\t" \
 "movd "_off"(%[src],%[src_ystride]),%%mm7\n\t" \
 "punpcklbw %%mm0,%%mm6\n\t" \
 "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
 "punpcklbw %%mm0,%%mm0\n\t" \
 /*Likewise for %[ref], which now points at row 8.*/ \
 "neg %[ref_ystride]\n\t" \
 "psubw %%mm0,%%mm6\n\t" \
 "movd "_off"(%[ref],%[ref_ystride]),%%mm0\n\t" \
 "lea (%[src],%[src_ystride],8),%[src]\n\t" \
 "punpcklbw %%mm0,%%mm7\n\t" \
 "neg %[src_ystride]\n\t" \
 "punpcklbw %%mm0,%%mm0\n\t" \
 "lea (%[ref],%[ref_ystride],8),%[ref]\n\t" \
 "psubw %%mm0,%%mm7\n\t" \
 "neg %[ref_ystride]\n\t" \
 /*Restore the row 0 difference that was parked above.*/ \
 "movq "_off"*2(%[buf]),%%mm0\n\t" \
222 
/*Load an 8x4 array of pixel values from %[src] into %%mm0...%%mm7.
  Four bytes are read at byte offset _off (a string literal) from each of
   eight rows and zero-extended to 16-bit words, with row N ending up in
   %%mmN.
  Rows 0...3 are addressed from %[src] and rows 4...7 from %[src4], using
   %[ystride] and %[ystride3] as row offsets; none of these is modified.
  Rows 0...3 are widened by interleaving with a zeroed %%mm7.
  Rows 4...7 are widened by interleaving each register with itself and
   shifting every word right by 8, since %%mm7 is itself one of the
   destinations.*/
#define OC_LOAD_8x4(_off) \
 "#OC_LOAD_8x4\n\t" \
 "movd "_off"(%[src]),%%mm0\n\t" \
 "movd "_off"(%[src],%[ystride]),%%mm1\n\t" \
 "movd "_off"(%[src],%[ystride],2),%%mm2\n\t" \
 "pxor %%mm7,%%mm7\n\t" \
 "movd "_off"(%[src],%[ystride3]),%%mm3\n\t" \
 "punpcklbw %%mm7,%%mm0\n\t" \
 "movd "_off"(%[src4]),%%mm4\n\t" \
 "punpcklbw %%mm7,%%mm1\n\t" \
 "movd "_off"(%[src4],%[ystride]),%%mm5\n\t" \
 "punpcklbw %%mm7,%%mm2\n\t" \
 "movd "_off"(%[src4],%[ystride],2),%%mm6\n\t" \
 "punpcklbw %%mm7,%%mm3\n\t" \
 "movd "_off"(%[src4],%[ystride3]),%%mm7\n\t" \
 "punpcklbw %%mm4,%%mm4\n\t" \
 "punpcklbw %%mm5,%%mm5\n\t" \
 "psrlw $8,%%mm4\n\t" \
 "psrlw $8,%%mm5\n\t" \
 "punpcklbw %%mm6,%%mm6\n\t" \
 "punpcklbw %%mm7,%%mm7\n\t" \
 "psrlw $8,%%mm6\n\t" \
 "psrlw $8,%%mm7\n\t" \
247 
/*Performs the first two stages of an 8-point 1-D Hadamard transform.
  The eight points are the registers %%mm0...%%mm7; each holds four 16-bit
   words, so four transforms are computed side by side.
  The transform is performed in place, except that outputs 0-3 are swapped with
   outputs 4-7.
  Outputs 2, 3, 6 and 7 from the second stage are negated (which allows us to
   perform this stage in place with no temporary registers).
  Each butterfly on a pair (a,b) is computed as b+=a, a+=a, a-=b, leaving the
   sum in b and the difference in a.*/
#define OC_HADAMARD_AB_8x4 \
 "#OC_HADAMARD_AB_8x4\n\t" \
 /*Stage A: \
   Outputs 0-3 are swapped with 4-7 here.*/ \
 "paddw %%mm1,%%mm5\n\t" \
 "paddw %%mm2,%%mm6\n\t" \
 "paddw %%mm1,%%mm1\n\t" \
 "paddw %%mm2,%%mm2\n\t" \
 "psubw %%mm5,%%mm1\n\t" \
 "psubw %%mm6,%%mm2\n\t" \
 "paddw %%mm3,%%mm7\n\t" \
 "paddw %%mm0,%%mm4\n\t" \
 "paddw %%mm3,%%mm3\n\t" \
 "paddw %%mm0,%%mm0\n\t" \
 "psubw %%mm7,%%mm3\n\t" \
 "psubw %%mm4,%%mm0\n\t" \
 /*Stage B:*/ \
 "paddw %%mm2,%%mm0\n\t" \
 "paddw %%mm3,%%mm1\n\t" \
 "paddw %%mm6,%%mm4\n\t" \
 "paddw %%mm7,%%mm5\n\t" \
 "paddw %%mm2,%%mm2\n\t" \
 "paddw %%mm3,%%mm3\n\t" \
 "paddw %%mm6,%%mm6\n\t" \
 "paddw %%mm7,%%mm7\n\t" \
 "psubw %%mm0,%%mm2\n\t" \
 "psubw %%mm1,%%mm3\n\t" \
 "psubw %%mm4,%%mm6\n\t" \
 "psubw %%mm5,%%mm7\n\t" \
282 
/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
  Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
   place with no temporary registers).
  The butterflies pair adjacent registers: (%%mm0,%%mm1), (%%mm2,%%mm3),
   (%%mm4,%%mm5) and (%%mm6,%%mm7).
  The even register of each pair receives the sum, and the odd register
   receives its own value minus the even one.*/
#define OC_HADAMARD_C_8x4 \
 "#OC_HADAMARD_C_8x4\n\t" \
 /*Stage C:*/ \
 "paddw %%mm1,%%mm0\n\t" \
 "paddw %%mm3,%%mm2\n\t" \
 "paddw %%mm5,%%mm4\n\t" \
 "paddw %%mm7,%%mm6\n\t" \
 "paddw %%mm1,%%mm1\n\t" \
 "paddw %%mm3,%%mm3\n\t" \
 "paddw %%mm5,%%mm5\n\t" \
 "paddw %%mm7,%%mm7\n\t" \
 "psubw %%mm0,%%mm1\n\t" \
 "psubw %%mm2,%%mm3\n\t" \
 "psubw %%mm4,%%mm5\n\t" \
 "psubw %%mm6,%%mm7\n\t" \
301 
/*Performs an 8-point 1-D Hadamard transform.
  This is simply stages A and B (OC_HADAMARD_AB_8x4) followed by stage C
   (OC_HADAMARD_C_8x4), operating on %%mm0...%%mm7.
  The transform is performed in place, except that outputs 0-3 are swapped with
   outputs 4-7.
  Outputs 1, 2, 5 and 6 are negated (which allows us to perform the transform
   in place with no temporary registers).*/
#define OC_HADAMARD_8x4 \
 OC_HADAMARD_AB_8x4 \
 OC_HADAMARD_C_8x4 \
310 
/*Performs the first part of the final stage of the Hadamard transform and
   summing of absolute values.
  _r6 and _r7 are string literals giving the byte offsets into %[buf] at which
   %%mm6 and %%mm7 are spilled to free them for use as temporaries.
  At the end of this part, %%mm1 will contain the DC coefficient of the
   transform.
  On exit, %%mm0 holds max(abs(mm0),abs(mm1))-0x7FFF, %%mm2 and %%mm4 hold the
   pairwise maxima that part B still has to correct, %%mm6 and %%mm1 hold the
   matching pairwise sums, %%mm3 has been reloaded with the spilled %%mm7, and
   %%mm7 holds {0x7FFF}x4.*/
#define OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \
 /*We use the fact that \
     (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
    to merge the final butterfly with the abs and the first stage of \
    accumulation. \
   Thus we can avoid using pabsw, which is not available until SSSE3. \
   Emulating pabsw takes 3 instructions, so the straightforward MMXEXT \
    implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
    registers). \
   Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
   This implementation is only 26 (+4 for spilling registers).*/ \
 "#OC_HADAMARD_C_ABS_ACCUM_A_8x4\n\t" \
 "movq %%mm7,"_r7"(%[buf])\n\t" \
 "movq %%mm6,"_r6"(%[buf])\n\t" \
 /*mm7={0x7FFF}x4 \
   mm0=max(abs(mm0),abs(mm1))-0x7FFF*/ \
 "pcmpeqb %%mm7,%%mm7\n\t" \
 "movq %%mm0,%%mm6\n\t" \
 "psrlw $1,%%mm7\n\t" \
 "paddw %%mm1,%%mm6\n\t" \
 "pmaxsw %%mm1,%%mm0\n\t" \
 /*The saturating add clamps a non-negative sum to 0x7FFF, so the subtraction \
    below yields max(a,b)-0x7FFF in that case and -min(a,b)-0x7FFF \
    otherwise.*/ \
 "paddsw %%mm7,%%mm6\n\t" \
 "psubw %%mm6,%%mm0\n\t" \
 /*mm2=max(abs(mm2),abs(mm3))-0x7FFF \
   mm4=max(abs(mm4),abs(mm5))-0x7FFF*/ \
 "movq %%mm2,%%mm6\n\t" \
 "movq %%mm4,%%mm1\n\t" \
 "pmaxsw %%mm3,%%mm2\n\t" \
 "pmaxsw %%mm5,%%mm4\n\t" \
 "paddw %%mm3,%%mm6\n\t" \
 "paddw %%mm5,%%mm1\n\t" \
 "movq "_r7"(%[buf]),%%mm3\n\t" \
347 
/*Performs the second part of the final stage of the Hadamard transform and
   summing of absolute values.
  This continues from the register state left by
   OC_HADAMARD_C_ABS_ACCUM_A_8x4 and must be given the same _r6 and _r7 spill
   offsets.
  On exit, %%mm0 holds the accumulated sums in its four words and %%mm7 holds
   {1}x4.*/
#define OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \
 "#OC_HADAMARD_C_ABS_ACCUM_B_8x4\n\t" \
 "paddsw %%mm7,%%mm6\n\t" \
 /*Reload the value of %%mm6 that part A spilled.*/ \
 "movq "_r6"(%[buf]),%%mm5\n\t" \
 "paddsw %%mm7,%%mm1\n\t" \
 "psubw %%mm6,%%mm2\n\t" \
 "psubw %%mm1,%%mm4\n\t" \
 /*mm7={1}x4 (needed for the horizontal add that follows) \
   mm0+=mm2+mm4+max(abs(mm3),abs(mm5))-0x7FFF*/ \
 "movq %%mm3,%%mm6\n\t" \
 "pmaxsw %%mm5,%%mm3\n\t" \
 "paddw %%mm2,%%mm0\n\t" \
 "paddw %%mm5,%%mm6\n\t" \
 "paddw %%mm4,%%mm0\n\t" \
 "paddsw %%mm7,%%mm6\n\t" \
 "paddw %%mm3,%%mm0\n\t" \
 "psrlw $14,%%mm7\n\t" \
 "psubw %%mm6,%%mm0\n\t" \
368 
/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the
   absolute value of each component, and accumulates everything into mm0.
  This is parts A and B above run back to back with the same spill offsets
   _r6 and _r7 into %[buf].
  This is the only portion of SATD which requires MMXEXT (we could use plain
   MMX, but it takes 4 instructions and an extra register to work around the
   lack of a pmaxsw, which is a pretty serious penalty).*/
#define OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) \
 OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \
 OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \
377 
/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each
   component, and accumulates everything into mm0.
  It runs stages A and B of the transform (OC_HADAMARD_AB_8x4) and then the
   merged final stage and accumulation (OC_HADAMARD_C_ABS_ACCUM_8x4), which
   spills two registers at byte offsets _r6 and _r7 of %[buf].
  Note that mm0 will have an extra 4 added to each column, and that after
   removing this value, the remainder will be half the conventional value.*/
#define OC_HADAMARD_ABS_ACCUM_8x4(_r6,_r7) \
 OC_HADAMARD_AB_8x4 \
 OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7)
385 
/*Performs two 4x4 transposes (mostly) in place.
  On input, {mm0,mm1,mm2,mm3} contains rows {e,f,g,h}, and {mm4,mm5,mm6,mm7}
   contains rows {a,b,c,d}.
  On output, {0x40,0x50,0x60,0x70}+_off(%[buf]) contains {e,f,g,h}^T, and
   {mm4,mm5,mm6,mm7} contains the transposed rows {a,b,c,d}^T.
  _off is a string literal giving a byte offset into %[buf].
  The eight bytes at 0x10+_off(%[buf]) are used to hold %%mm5 during the first
   transpose, and %%mm0...%%mm3 are left holding temporaries.*/
#define OC_TRANSPOSE_4x4x2(_off) \
 "#OC_TRANSPOSE_4x4x2\n\t" \
 /*First 4x4 transpose:*/ \
 "movq %%mm5,0x10+"_off"(%[buf])\n\t" \
 /*mm0 = e3 e2 e1 e0 \
   mm1 = f3 f2 f1 f0 \
   mm2 = g3 g2 g1 g0 \
   mm3 = h3 h2 h1 h0*/ \
 "movq %%mm2,%%mm5\n\t" \
 "punpcklwd %%mm3,%%mm2\n\t" \
 "punpckhwd %%mm3,%%mm5\n\t" \
 "movq %%mm0,%%mm3\n\t" \
 "punpcklwd %%mm1,%%mm0\n\t" \
 "punpckhwd %%mm1,%%mm3\n\t" \
 /*mm0 = f1 e1 f0 e0 \
   mm3 = f3 e3 f2 e2 \
   mm2 = h1 g1 h0 g0 \
   mm5 = h3 g3 h2 g2*/ \
 "movq %%mm0,%%mm1\n\t" \
 "punpckldq %%mm2,%%mm0\n\t" \
 "punpckhdq %%mm2,%%mm1\n\t" \
 "movq %%mm3,%%mm2\n\t" \
 "punpckhdq %%mm5,%%mm3\n\t" \
 "movq %%mm0,0x40+"_off"(%[buf])\n\t" \
 "punpckldq %%mm5,%%mm2\n\t" \
 /*mm0 = h0 g0 f0 e0 \
   mm1 = h1 g1 f1 e1 \
   mm2 = h2 g2 f2 e2 \
   mm3 = h3 g3 f3 e3*/ \
 "movq 0x10+"_off"(%[buf]),%%mm5\n\t" \
 /*Second 4x4 transpose:*/ \
 /*mm4 = a3 a2 a1 a0 \
   mm5 = b3 b2 b1 b0 \
   mm6 = c3 c2 c1 c0 \
   mm7 = d3 d2 d1 d0*/ \
 "movq %%mm6,%%mm0\n\t" \
 "punpcklwd %%mm7,%%mm6\n\t" \
 "movq %%mm1,0x50+"_off"(%[buf])\n\t" \
 "punpckhwd %%mm7,%%mm0\n\t" \
 "movq %%mm4,%%mm7\n\t" \
 "punpcklwd %%mm5,%%mm4\n\t" \
 "movq %%mm2,0x60+"_off"(%[buf])\n\t" \
 "punpckhwd %%mm5,%%mm7\n\t" \
 /*mm4 = b1 a1 b0 a0 \
   mm7 = b3 a3 b2 a2 \
   mm6 = d1 c1 d0 c0 \
   mm0 = d3 c3 d2 c2*/ \
 "movq %%mm4,%%mm5\n\t" \
 "punpckldq %%mm6,%%mm4\n\t" \
 "movq %%mm3,0x70+"_off"(%[buf])\n\t" \
 "punpckhdq %%mm6,%%mm5\n\t" \
 "movq %%mm7,%%mm6\n\t" \
 "punpckhdq %%mm0,%%mm7\n\t" \
 "punpckldq %%mm0,%%mm6\n\t" \
 /*mm4 = d0 c0 b0 a0 \
   mm5 = d1 c1 b1 a1 \
   mm6 = d2 c2 b2 a2 \
   mm7 = d3 c3 b3 a3*/ \
449 
oc_int_frag_satd_thresh_mmxext(const unsigned char * _src,int _src_ystride,const unsigned char * _ref,int _ref_ystride,unsigned _thresh)450 static unsigned oc_int_frag_satd_thresh_mmxext(const unsigned char *_src,
451  int _src_ystride,const unsigned char *_ref,int _ref_ystride,unsigned _thresh){
452   OC_ALIGN8(ogg_int16_t  buf[64]);
453   ogg_int16_t *bufp;
454   unsigned     ret;
455   unsigned     ret2;
456   bufp=buf;
457   __asm__ __volatile__(
458     OC_LOAD_SUB_8x4("0x00")
459     OC_HADAMARD_8x4
460     OC_TRANSPOSE_4x4x2("0x00")
461     /*Finish swapping out this 8x4 block to make room for the next one.
462       mm0...mm3 have been swapped out already.*/
463     "movq %%mm4,0x00(%[buf])\n\t"
464     "movq %%mm5,0x10(%[buf])\n\t"
465     "movq %%mm6,0x20(%[buf])\n\t"
466     "movq %%mm7,0x30(%[buf])\n\t"
467     OC_LOAD_SUB_8x4("0x04")
468     OC_HADAMARD_8x4
469     OC_TRANSPOSE_4x4x2("0x08")
470     /*Here the first 4x4 block of output from the last transpose is the second
471        4x4 block of input for the next transform.
472       We have cleverly arranged that it already be in the appropriate place, so
473        we only have to do half the loads.*/
474     "movq 0x10(%[buf]),%%mm1\n\t"
475     "movq 0x20(%[buf]),%%mm2\n\t"
476     "movq 0x30(%[buf]),%%mm3\n\t"
477     "movq 0x00(%[buf]),%%mm0\n\t"
478     OC_HADAMARD_ABS_ACCUM_8x4("0x28","0x38")
479     /*Up to this point, everything fit in 16 bits (8 input + 1 for the
480        difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
481        for the factor of two we dropped + 3 for the vertical accumulation).
482       Now we finally have to promote things to dwords.
483       We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
484        latency of pmaddwd by starting the next series of loads now.*/
485     "mov %[thresh],%[ret2]\n\t"
486     "pmaddwd %%mm7,%%mm0\n\t"
487     "movq 0x50(%[buf]),%%mm1\n\t"
488     "movq 0x58(%[buf]),%%mm5\n\t"
489     "movq %%mm0,%%mm4\n\t"
490     "movq 0x60(%[buf]),%%mm2\n\t"
491     "punpckhdq %%mm0,%%mm0\n\t"
492     "movq 0x68(%[buf]),%%mm6\n\t"
493     "paddd %%mm0,%%mm4\n\t"
494     "movq 0x70(%[buf]),%%mm3\n\t"
495     "movd %%mm4,%[ret]\n\t"
496     "movq 0x78(%[buf]),%%mm7\n\t"
497     /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
498        added to them, and a factor of two removed; correct the final sum here.*/
499     "lea -32(%[ret],%[ret]),%[ret]\n\t"
500     "movq 0x40(%[buf]),%%mm0\n\t"
501     "cmp %[ret2],%[ret]\n\t"
502     "movq 0x48(%[buf]),%%mm4\n\t"
503     "jae 1f\n\t"
504     OC_HADAMARD_ABS_ACCUM_8x4("0x68","0x78")
505     "pmaddwd %%mm7,%%mm0\n\t"
506     /*There isn't much to stick in here to hide the latency this time, but the
507        alternative to pmaddwd is movq->punpcklwd->punpckhwd->paddd, whose
508        latency is even worse.*/
509     "sub $32,%[ret]\n\t"
510     "movq %%mm0,%%mm4\n\t"
511     "punpckhdq %%mm0,%%mm0\n\t"
512     "paddd %%mm0,%%mm4\n\t"
513     "movd %%mm4,%[ret2]\n\t"
514     "lea (%[ret],%[ret2],2),%[ret]\n\t"
515     ".p2align 4,,15\n\t"
516     "1:\n\t"
517     /*Although it looks like we're using 7 registers here, gcc can alias %[ret]
518        and %[ret2] with some of the inputs, since for once we don't write to
519        them until after we're done using everything but %[buf] (which is also
520        listed as an output to ensure gcc _doesn't_ alias them against it).*/
521     /*Note that _src_ystride and _ref_ystride must be given non-overlapping
522        constraints, otherewise if gcc can prove they're equal it will allocate
523        them to the same register (which is bad); _src and _ref face a similar
524        problem, though those are never actually the same.*/
525     :[ret]"=a"(ret),[ret2]"=r"(ret2),[buf]"+r"(bufp)
526     :[src]"r"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),
527      [ref]"r"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride),
528      [thresh]"m"(_thresh)
529     /*We have to use neg, so we actually clobber the condition codes for once
530        (not to mention cmp, sub, and add).*/
531     :"cc"
532   );
533   return ret;
534 }
535 
/*Computes the thresholded 8x8 SATD for two blocks that share one row stride,
   by forwarding to oc_int_frag_satd_thresh_mmxext() with _ystride used for
   both the source and the reference.*/
unsigned oc_enc_frag_satd_thresh_mmxext(const unsigned char *_src,
 const unsigned char *_ref,int _ystride,unsigned _thresh){
  unsigned satd;
  satd=oc_int_frag_satd_thresh_mmxext(_src,_ystride,_ref,_ystride,_thresh);
  return satd;
}
540 
/*Our internal implementation of frag_copy2 takes an extra stride parameter so
   we can share code with oc_enc_frag_satd2_thresh_mmxext().
  Writes to _dst the rounded-down average of two 8x8 blocks of bytes.
  _dst:         The first row of the destination block.
  _dst_ystride: The offset in bytes between rows of the destination.
  _src1:        The first row of the first source block.
  _src2:        The first row of the second source block.
  _src_ystride: The offset in bytes between rows, shared by both sources.
  Each output byte is (_src1[i]+_src2[i])>>1, computed as pavgb (which rounds
   up) minus the low bit of the exclusive or of the two inputs.
  The MMX registers are overwritten, and no emms is executed here.*/
static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
 const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
  __asm__ __volatile__(
    /*Load the first 3 rows.*/
    "movq (%[src1]),%%mm0\n\t"
    "movq (%[src2]),%%mm1\n\t"
    "movq (%[src1],%[src_ystride]),%%mm2\n\t"
    "lea (%[src1],%[src_ystride],2),%[src1]\n\t"
    "movq (%[src2],%[src_ystride]),%%mm3\n\t"
    "lea (%[src2],%[src_ystride],2),%[src2]\n\t"
    "pxor %%mm7,%%mm7\n\t"
    "movq (%[src1]),%%mm4\n\t"
    "pcmpeqb %%mm6,%%mm6\n\t"
    "movq (%[src2]),%%mm5\n\t"
    /*mm7={1}x8.*/
    "psubb %%mm6,%%mm7\n\t"
    /*Start averaging %%mm0 and %%mm1 into %%mm6.*/
    "movq %%mm0,%%mm6\n\t"
    "pxor %%mm1,%%mm0\n\t"
    "pavgb %%mm1,%%mm6\n\t"
    /*%%mm1 is free, start averaging %%mm3 into %%mm2 using %%mm1.*/
    "movq %%mm2,%%mm1\n\t"
    "pand %%mm7,%%mm0\n\t"
    "pavgb %%mm3,%%mm2\n\t"
    "pxor %%mm3,%%mm1\n\t"
    /*%%mm3 is free.*/
    "psubb %%mm0,%%mm6\n\t"
    /*%%mm0 is free, start loading the next row.*/
    "movq (%[src1],%[src_ystride]),%%mm0\n\t"
    /*Start averaging %%mm5 and %%mm4 using %%mm3.*/
    "movq %%mm4,%%mm3\n\t"
    /*%%mm6 (row 0) is done; write it out.*/
    "movq %%mm6,(%[dst])\n\t"
    "pand %%mm7,%%mm1\n\t"
    "pavgb %%mm5,%%mm4\n\t"
    "psubb %%mm1,%%mm2\n\t"
    /*%%mm1 is free, continue loading the next row.*/
    "movq (%[src2],%[src_ystride]),%%mm1\n\t"
    "pxor %%mm5,%%mm3\n\t"
    "lea (%[src1],%[src_ystride],2),%[src1]\n\t"
    /*%%mm2 (row 1) is done; write it out.*/
    "movq %%mm2,(%[dst],%[dst_ystride])\n\t"
    "pand %%mm7,%%mm3\n\t"
    /*Start loading the next row.*/
    "movq (%[src1]),%%mm2\n\t"
    "lea (%[dst],%[dst_ystride],2),%[dst]\n\t"
    "psubb %%mm3,%%mm4\n\t"
    "lea (%[src2],%[src_ystride],2),%[src2]\n\t"
    /*%%mm4 (row 2) is done; write it out.*/
    "movq %%mm4,(%[dst])\n\t"
    /*Continue loading the next row.*/
    "movq (%[src2]),%%mm3\n\t"
    /*Start averaging %%mm0 and %%mm1 into %%mm6.*/
    "movq %%mm0,%%mm6\n\t"
    "pxor %%mm1,%%mm0\n\t"
    /*Start loading the next row.*/
    "movq (%[src1],%[src_ystride]),%%mm4\n\t"
    "pavgb %%mm1,%%mm6\n\t"
    /*%%mm1 is free; start averaging %%mm3 into %%mm2 using %%mm1.*/
    "movq %%mm2,%%mm1\n\t"
    "pand %%mm7,%%mm0\n\t"
    /*Continue loading the next row.*/
    "movq (%[src2],%[src_ystride]),%%mm5\n\t"
    "pavgb %%mm3,%%mm2\n\t"
    "lea (%[src1],%[src_ystride],2),%[src1]\n\t"
    "pxor %%mm3,%%mm1\n\t"
    /*%%mm3 is free.*/
    "psubb %%mm0,%%mm6\n\t"
    /*%%mm0 is free, start loading the next row.*/
    "movq (%[src1]),%%mm0\n\t"
    /*Start averaging %%mm5 into %%mm4 using %%mm3.*/
    "movq %%mm4,%%mm3\n\t"
    /*%%mm6 (row 3) is done; write it out.*/
    "movq %%mm6,(%[dst],%[dst_ystride])\n\t"
    "pand %%mm7,%%mm1\n\t"
    "lea (%[src2],%[src_ystride],2),%[src2]\n\t"
    "pavgb %%mm5,%%mm4\n\t"
    "lea (%[dst],%[dst_ystride],2),%[dst]\n\t"
    "psubb %%mm1,%%mm2\n\t"
    /*%%mm1 is free; continue loading the next row.*/
    "movq (%[src2]),%%mm1\n\t"
    "pxor %%mm5,%%mm3\n\t"
    /*%%mm2 (row 4) is done; write it out.*/
    "movq %%mm2,(%[dst])\n\t"
    "pand %%mm7,%%mm3\n\t"
    /*Start loading the next row.*/
    "movq (%[src1],%[src_ystride]),%%mm2\n\t"
    "psubb %%mm3,%%mm4\n\t"
    /*Start averaging %%mm0 and %%mm1 into %%mm6.*/
    "movq %%mm0,%%mm6\n\t"
    /*Continue loading the next row.*/
    "movq (%[src2],%[src_ystride]),%%mm3\n\t"
    /*%%mm4 (row 5) is done; write it out.*/
    "movq %%mm4,(%[dst],%[dst_ystride])\n\t"
    "pxor %%mm1,%%mm0\n\t"
    "pavgb %%mm1,%%mm6\n\t"
    /*%%mm4 is free; start averaging %%mm3 into %%mm2 using %%mm4.*/
    "movq %%mm2,%%mm4\n\t"
    "pand %%mm7,%%mm0\n\t"
    "pavgb %%mm3,%%mm2\n\t"
    "pxor %%mm3,%%mm4\n\t"
    "lea (%[dst],%[dst_ystride],2),%[dst]\n\t"
    "psubb %%mm0,%%mm6\n\t"
    "pand %%mm7,%%mm4\n\t"
    /*%%mm6 (row 6) is done, write it out.*/
    "movq %%mm6,(%[dst])\n\t"
    "psubb %%mm4,%%mm2\n\t"
    /*%%mm2 (row 7) is done, write it out.*/
    "movq %%mm2,(%[dst],%[dst_ystride])\n\t"
    /*The '%' commutative modifier is not used on %[src1]: GCC only permits it
       on read-only operands, and both source pointers are read-write here.*/
    :[dst]"+r"(_dst),[src1]"+r"(_src1),[src2]"+r"(_src2)
    :[dst_ystride]"r"((ptrdiff_t)_dst_ystride),
     [src_ystride]"r"((ptrdiff_t)_src_ystride)
    /*The destination is written through %[dst] rather than through a memory
       operand.*/
    :"memory"
  );
}
658 
oc_enc_frag_satd2_thresh_mmxext(const unsigned char * _src,const unsigned char * _ref1,const unsigned char * _ref2,int _ystride,unsigned _thresh)659 unsigned oc_enc_frag_satd2_thresh_mmxext(const unsigned char *_src,
660  const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
661  unsigned _thresh){
662   OC_ALIGN8(unsigned char ref[64]);
663   oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
664   return oc_int_frag_satd_thresh_mmxext(_src,_ystride,ref,8,_thresh);
665 }
666 
oc_enc_frag_intra_satd_mmxext(const unsigned char * _src,int _ystride)667 unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,
668  int _ystride){
669   OC_ALIGN8(ogg_int16_t  buf[64]);
670   ogg_int16_t *bufp;
671   unsigned     ret;
672   unsigned     ret2;
673   bufp=buf;
674   __asm__ __volatile__(
675     OC_LOAD_8x4("0x00")
676     OC_HADAMARD_8x4
677     OC_TRANSPOSE_4x4x2("0x00")
678     /*Finish swapping out this 8x4 block to make room for the next one.
679       mm0...mm3 have been swapped out already.*/
680     "movq %%mm4,0x00(%[buf])\n\t"
681     "movq %%mm5,0x10(%[buf])\n\t"
682     "movq %%mm6,0x20(%[buf])\n\t"
683     "movq %%mm7,0x30(%[buf])\n\t"
684     OC_LOAD_8x4("0x04")
685     OC_HADAMARD_8x4
686     OC_TRANSPOSE_4x4x2("0x08")
687     /*Here the first 4x4 block of output from the last transpose is the second
688        4x4 block of input for the next transform.
689       We have cleverly arranged that it already be in the appropriate place, so
690        we only have to do half the loads.*/
691     "movq 0x10(%[buf]),%%mm1\n\t"
692     "movq 0x20(%[buf]),%%mm2\n\t"
693     "movq 0x30(%[buf]),%%mm3\n\t"
694     "movq 0x00(%[buf]),%%mm0\n\t"
695     /*We split out the stages here so we can save the DC coefficient in the
696        middle.*/
697     OC_HADAMARD_AB_8x4
698     OC_HADAMARD_C_ABS_ACCUM_A_8x4("0x28","0x38")
699     "movd %%mm1,%[ret]\n\t"
700     OC_HADAMARD_C_ABS_ACCUM_B_8x4("0x28","0x38")
701     /*Up to this point, everything fit in 16 bits (8 input + 1 for the
702        difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
703        for the factor of two we dropped + 3 for the vertical accumulation).
704       Now we finally have to promote things to dwords.
705       We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
706        latency of pmaddwd by starting the next series of loads now.*/
707     "pmaddwd %%mm7,%%mm0\n\t"
708     "movq 0x50(%[buf]),%%mm1\n\t"
709     "movq 0x58(%[buf]),%%mm5\n\t"
710     "movq 0x60(%[buf]),%%mm2\n\t"
711     "movq %%mm0,%%mm4\n\t"
712     "movq 0x68(%[buf]),%%mm6\n\t"
713     "punpckhdq %%mm0,%%mm0\n\t"
714     "movq 0x70(%[buf]),%%mm3\n\t"
715     "paddd %%mm0,%%mm4\n\t"
716     "movq 0x78(%[buf]),%%mm7\n\t"
717     "movd %%mm4,%[ret2]\n\t"
718     "movq 0x40(%[buf]),%%mm0\n\t"
719     "movq 0x48(%[buf]),%%mm4\n\t"
720     OC_HADAMARD_ABS_ACCUM_8x4("0x68","0x78")
721     "pmaddwd %%mm7,%%mm0\n\t"
722     /*We assume that the DC coefficient is always positive (which is true,
723        because the input to the INTRA transform was not a difference).*/
724     "movzx %w[ret],%[ret]\n\t"
725     "add %[ret2],%[ret2]\n\t"
726     "sub %[ret],%[ret2]\n\t"
727     "movq %%mm0,%%mm4\n\t"
728     "punpckhdq %%mm0,%%mm0\n\t"
729     "paddd %%mm0,%%mm4\n\t"
730     "movd %%mm4,%[ret]\n\t"
731     "lea -64(%[ret2],%[ret],2),%[ret]\n\t"
732     /*Although it looks like we're using 7 registers here, gcc can alias %[ret]
733        and %[ret2] with some of the inputs, since for once we don't write to
734        them until after we're done using everything but %[buf] (which is also
735        listed as an output to ensure gcc _doesn't_ alias them against it).*/
736     :[ret]"=a"(ret),[ret2]"=r"(ret2),[buf]"+r"(bufp)
737     :[src]"r"(_src),[src4]"r"(_src+4*_ystride),
738      [ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride)
739     /*We have to use sub, so we actually clobber the condition codes for once
740        (not to mention add).*/
741     :"cc"
742   );
743   return ret;
744 }
745 
/*Computes the 16-bit difference between two 8x8 blocks of bytes.
  _residue:  Receives the 64 differences _src-_ref, stored row by row with 8
              values per row.
  _src:      The first row of the block to subtract from.
  _ref:      The first row of the block to subtract.
  _ystride:  The offset in bytes between consecutive rows, shared by both
              blocks.
  Each pass of the loop handles two rows, so four passes cover the block.
  The MMX registers are overwritten, and no emms is executed here.*/
void oc_enc_frag_sub_mmx(ogg_int16_t _residue[64],
 const unsigned char *_src,const unsigned char *_ref,int _ystride){
  int i;
  for(i=4;i-->0;){
    __asm__ __volatile__(
      /*mm7=0, used to zero-extend the bytes to words.
        This is done inside the same asm statement that uses it, because the
         compiler gives no guarantee about what happens between two separate
         asm statements.*/
      "pxor %%mm7,%%mm7\n\t"
      /*mm0=[src]*/
      "movq (%[src]),%%mm0\n\t"
      /*mm1=[ref]*/
      "movq (%[ref]),%%mm1\n\t"
      /*mm4=[src+ystride]*/
      "movq (%[src],%[ystride]),%%mm4\n\t"
      /*mm5=[ref+ystride]*/
      "movq (%[ref],%[ystride]),%%mm5\n\t"
      /*Compute [src]-[ref].*/
      "movq %%mm0,%%mm2\n\t"
      "punpcklbw %%mm7,%%mm0\n\t"
      "movq %%mm1,%%mm3\n\t"
      "punpckhbw %%mm7,%%mm2\n\t"
      "punpcklbw %%mm7,%%mm1\n\t"
      "punpckhbw %%mm7,%%mm3\n\t"
      "psubw %%mm1,%%mm0\n\t"
      "psubw %%mm3,%%mm2\n\t"
      /*Compute [src+ystride]-[ref+ystride].*/
      "movq %%mm4,%%mm1\n\t"
      "punpcklbw %%mm7,%%mm4\n\t"
      "movq %%mm5,%%mm3\n\t"
      "punpckhbw %%mm7,%%mm1\n\t"
      "lea (%[src],%[ystride],2),%[src]\n\t"
      "punpcklbw %%mm7,%%mm5\n\t"
      "lea (%[ref],%[ystride],2),%[ref]\n\t"
      "punpckhbw %%mm7,%%mm3\n\t"
      "psubw %%mm5,%%mm4\n\t"
      "psubw %%mm3,%%mm1\n\t"
      /*Write the answer out.*/
      "movq %%mm0,0x00(%[residue])\n\t"
      "movq %%mm2,0x08(%[residue])\n\t"
      "movq %%mm4,0x10(%[residue])\n\t"
      "movq %%mm1,0x18(%[residue])\n\t"
      /*Advance the output by the two rows (16 values) just written.*/
      "lea 0x20(%[residue]),%[residue]\n\t"
      :[residue]"+r"(_residue),[src]"+r"(_src),[ref]"+r"(_ref)
      :[ystride]"r"((ptrdiff_t)_ystride)
      :"memory"
    );
  }
}
792 
/*Subtracts 128 from every byte of an 8x8 block, producing 16-bit words.
  _residue: Receives the 64 values _src[...]-128, stored one row of 8 words
             after another (byte offsets 0x00 through 0x78).
  _src:     The first byte of the block.
  _ystride: The offset in bytes between consecutive rows of _src.
  The whole block is handled by one straight-line asm statement; the loads for
   later rows are interleaved with the arithmetic on earlier rows (presumably
   to hide load latency).*/
void oc_enc_frag_sub_128_mmx(ogg_int16_t _residue[64],
 const unsigned char *_src,int _ystride){
  /*Scratch output of the asm; it holds 3*_ystride once the lea has run.*/
  ptrdiff_t ystride3;
  __asm__ __volatile__(
    /*mm0=[src]*/
    "movq (%[src]),%%mm0\n\t"
    /*mm1=[src+ystride]*/
    "movq (%[src],%[ystride]),%%mm1\n\t"
    /*mm6={-1}x4*/
    "pcmpeqw %%mm6,%%mm6\n\t"
    /*mm2=[src+2*ystride]*/
    "movq (%[src],%[ystride],2),%%mm2\n\t"
    /*[ystride3]=3*[ystride]*/
    "lea (%[ystride],%[ystride],2),%[ystride3]\n\t"
    /*mm6={0x8000}x4*/
    "psllw $15,%%mm6\n\t"
    /*mm3=[src+3*ystride]*/
    "movq (%[src],%[ystride3]),%%mm3\n\t"
    /*mm6={128}x4*/
    "psrlw $8,%%mm6\n\t"
    /*mm7=0*/
    "pxor %%mm7,%%mm7\n\t"
    /*[src]=[src]+4*[ystride]*/
    "lea (%[src],%[ystride],4),%[src]\n\t"
    /*Compute [src]-128 and [src+ystride]-128
      Unpacking against the zero in mm7 widens each byte to a 16-bit word
       before the subtraction.*/
    "movq %%mm0,%%mm4\n\t"
    "punpcklbw %%mm7,%%mm0\n\t"
    "movq %%mm1,%%mm5\n\t"
    "punpckhbw %%mm7,%%mm4\n\t"
    "psubw %%mm6,%%mm0\n\t"
    "punpcklbw %%mm7,%%mm1\n\t"
    "psubw %%mm6,%%mm4\n\t"
    "punpckhbw %%mm7,%%mm5\n\t"
    "psubw %%mm6,%%mm1\n\t"
    "psubw %%mm6,%%mm5\n\t"
    /*Write the answer out.*/
    "movq %%mm0,0x00(%[residue])\n\t"
    "movq %%mm4,0x08(%[residue])\n\t"
    "movq %%mm1,0x10(%[residue])\n\t"
    "movq %%mm5,0x18(%[residue])\n\t"
    /*mm0=[src+4*ystride]
      ([src] was already advanced by 4 rows above.)*/
    "movq (%[src]),%%mm0\n\t"
    /*mm1=[src+5*ystride]*/
    "movq (%[src],%[ystride]),%%mm1\n\t"
    /*Compute [src+2*ystride]-128 and [src+3*ystride]-128*/
    "movq %%mm2,%%mm4\n\t"
    "punpcklbw %%mm7,%%mm2\n\t"
    "movq %%mm3,%%mm5\n\t"
    "punpckhbw %%mm7,%%mm4\n\t"
    "psubw %%mm6,%%mm2\n\t"
    "punpcklbw %%mm7,%%mm3\n\t"
    "psubw %%mm6,%%mm4\n\t"
    "punpckhbw %%mm7,%%mm5\n\t"
    "psubw %%mm6,%%mm3\n\t"
    "psubw %%mm6,%%mm5\n\t"
    /*Write the answer out.*/
    "movq %%mm2,0x20(%[residue])\n\t"
    "movq %%mm4,0x28(%[residue])\n\t"
    "movq %%mm3,0x30(%[residue])\n\t"
    "movq %%mm5,0x38(%[residue])\n\t"
    /*mm2=[src+6*ystride]*/
    "movq (%[src],%[ystride],2),%%mm2\n\t"
    /*mm3=[src+7*ystride]*/
    "movq (%[src],%[ystride3]),%%mm3\n\t"
    /*Compute [src+4*ystride]-128 and [src+5*ystride]-128*/
    "movq %%mm0,%%mm4\n\t"
    "punpcklbw %%mm7,%%mm0\n\t"
    "movq %%mm1,%%mm5\n\t"
    "punpckhbw %%mm7,%%mm4\n\t"
    "psubw %%mm6,%%mm0\n\t"
    "punpcklbw %%mm7,%%mm1\n\t"
    "psubw %%mm6,%%mm4\n\t"
    "punpckhbw %%mm7,%%mm5\n\t"
    "psubw %%mm6,%%mm1\n\t"
    "psubw %%mm6,%%mm5\n\t"
    /*Write the answer out.*/
    "movq %%mm0,0x40(%[residue])\n\t"
    "movq %%mm4,0x48(%[residue])\n\t"
    "movq %%mm1,0x50(%[residue])\n\t"
    "movq %%mm5,0x58(%[residue])\n\t"
    /*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/
    "movq %%mm2,%%mm4\n\t"
    "punpcklbw %%mm7,%%mm2\n\t"
    "movq %%mm3,%%mm5\n\t"
    "punpckhbw %%mm7,%%mm4\n\t"
    "psubw %%mm6,%%mm2\n\t"
    "punpcklbw %%mm7,%%mm3\n\t"
    "psubw %%mm6,%%mm4\n\t"
    "punpckhbw %%mm7,%%mm5\n\t"
    "psubw %%mm6,%%mm3\n\t"
    "psubw %%mm6,%%mm5\n\t"
    /*Write the answer out.*/
    "movq %%mm2,0x60(%[residue])\n\t"
    "movq %%mm4,0x68(%[residue])\n\t"
    "movq %%mm3,0x70(%[residue])\n\t"
    "movq %%mm5,0x78(%[residue])\n\t"
    /*[src] is read-write because the asm advances it by 4 rows.
      [ystride3] is an early-clobber output: it is written while the input
       operands are still needed, so it must not share a register with them.*/
    :[src]"+r"(_src),[ystride3]"=&r"(ystride3)
    :[residue]"r"(_residue),[ystride]"r"((ptrdiff_t)_ystride)
    /*[residue] is only an input operand, so the stores through it are
       reported to the compiler with a memory clobber.*/
    :"memory"
  );
}
894 
/*Encoder entry point that forwards to oc_int_frag_copy2_mmxext().
  _ystride is passed twice, as the second and as the last argument of the
   helper.
  NOTE(review): presumably these are the destination stride and the source
   stride, respectively; confirm against the helper's declaration.*/
void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
 const unsigned char *_src1,const unsigned char *_src2,int _ystride){
  oc_int_frag_copy2_mmxext(_dst,_ystride,_src1,_src2,_ystride);
}
899 
900 #endif
901