/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
 *                                                                  *
 ********************************************************************

  function:
    last mod: $Id$

 ********************************************************************/

/*MMX acceleration of fragment reconstruction for motion compensation.
  Originally written by Rudolf Marek.
  Additional optimization by Nils Pipenbrinck.
  Note: Loops are unrolled for best performance.
  The iteration each instruction belongs to is marked in the comments as #i.*/
#include <stddef.h>
#include "x86int.h"

#if defined(OC_X86_ASM)

/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
   between rows.*/
# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
  do{ \
    const unsigned char *src; \
    unsigned char       *dst; \
    src=(_src); \
    dst=(_dst); \
    __asm  mov SRC,src \
    __asm  mov DST,dst \
    __asm  mov YSTRIDE,_ystride \
    /*src+0*ystride*/ \
    __asm  movq mm0,[SRC] \
    /*src+1*ystride*/ \
    __asm  movq mm1,[SRC+YSTRIDE] \
    /*ystride3=ystride*3*/ \
    __asm  lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
    /*src+2*ystride*/ \
    __asm  movq mm2,[SRC+YSTRIDE*2] \
    /*src+3*ystride*/ \
    __asm  movq mm3,[SRC+YSTRIDE3] \
    /*dst+0*ystride*/ \
    __asm  movq [DST],mm0 \
    /*dst+1*ystride*/ \
    __asm  movq [DST+YSTRIDE],mm1 \
    /*Advance to the next 4 rows.*/ \
    __asm  lea SRC,[SRC+YSTRIDE*4] \
    /*dst+2*ystride*/ \
    __asm  movq [DST+YSTRIDE*2],mm2 \
    /*dst+3*ystride*/ \
    __asm  movq [DST+YSTRIDE3],mm3 \
    /*Advance to the next 4 rows.*/ \
    __asm  lea DST,[DST+YSTRIDE*4] \
    /*src+0*ystride*/ \
    __asm  movq mm0,[SRC] \
    /*src+1*ystride*/ \
    __asm  movq mm1,[SRC+YSTRIDE] \
    /*src+2*ystride*/ \
    __asm  movq mm2,[SRC+YSTRIDE*2] \
    /*src+3*ystride*/ \
    __asm  movq mm3,[SRC+YSTRIDE3] \
    /*dst+0*ystride*/ \
    __asm  movq [DST],mm0 \
    /*dst+1*ystride*/ \
    __asm  movq [DST+YSTRIDE],mm1 \
    /*dst+2*ystride*/ \
    __asm  movq [DST+YSTRIDE*2],mm2 \
    /*dst+3*ystride*/ \
    __asm  movq [DST+YSTRIDE3],mm3 \
  } \
  while(0)

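/*For reference only: a minimal plain-C sketch of what OC_FRAG_COPY_MMX does,
   assuming nothing beyond the macro's own contract (_ystride bytes between
   rows of an 8x8 block).
  The name oc_frag_copy_c is hypothetical; the block is compiled out with
   #if 0 so it does not affect the build.*/
#if 0
static void oc_frag_copy_c(unsigned char *_dst,
 const unsigned char *_src,int _ystride){
  int i;
  int j;
  /*Copy 8 rows of 8 bytes each, advancing both pointers by the stride.*/
  for(i=0;i<8;i++){
    for(j=0;j<8;j++)_dst[j]=_src[j];
    _dst+=_ystride;
    _src+=_ystride;
  }
}
#endif
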
/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
   between rows.*/
void oc_frag_copy_mmx(unsigned char *_dst,
 const unsigned char *_src,int _ystride){
#define SRC edx
#define DST eax
#define YSTRIDE ecx
#define YSTRIDE3 esi
  OC_FRAG_COPY_MMX(_dst,_src,_ystride);
#undef SRC
#undef DST
#undef YSTRIDE
#undef YSTRIDE3
}

/*Copies the fragments specified by the list of fragment indices from one
   frame to another.
  _dst_frame:     The reference frame to copy to.
  _src_frame:     The reference frame to copy from.
  _ystride:       The row stride of the reference frames.
  _fragis:        A pointer to a list of fragment indices.
  _nfragis:       The number of fragment indices to copy.
  _frag_buf_offs: The offsets of fragments in the reference frames.*/
void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
 const unsigned char *_src_frame,int _ystride,
 const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
  ptrdiff_t fragii;
  for(fragii=0;fragii<_nfragis;fragii++){
    ptrdiff_t frag_buf_off;
    frag_buf_off=_frag_buf_offs[_fragis[fragii]];
#define SRC edx
#define DST eax
#define YSTRIDE ecx
#define YSTRIDE3 edi
    OC_FRAG_COPY_MMX(_dst_frame+frag_buf_off,
     _src_frame+frag_buf_off,_ystride);
#undef SRC
#undef DST
#undef YSTRIDE
#undef YSTRIDE3
  }
}

/*Reconstructs an intra-coded 8x8 fragment: adds 128 to each residue value
   and clamps the result to [0,255].*/
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
 const ogg_int16_t *_residue){
  __asm{
#define DST edx
#define DST4 esi
#define YSTRIDE eax
#define YSTRIDE3 edi
#define RESIDUE ecx
    mov DST,_dst
    mov YSTRIDE,_ystride
    mov RESIDUE,_residue
    lea DST4,[DST+YSTRIDE*4]
    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
    /*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/
    pcmpeqw mm0,mm0
    /*#0 Load low residue.*/
    movq mm1,[0*8+RESIDUE]
    /*#0 Load high residue.*/
    movq mm2,[1*8+RESIDUE]
    /*Set mm0 to 0x8000800080008000.*/
    psllw mm0,15
    /*#1 Load low residue.*/
    movq mm3,[2*8+RESIDUE]
    /*#1 Load high residue.*/
    movq mm4,[3*8+RESIDUE]
    /*Set mm0 to 0x0080008000800080.*/
    psrlw mm0,8
    /*#2 Load low residue.*/
    movq mm5,[4*8+RESIDUE]
    /*#2 Load high residue.*/
    movq mm6,[5*8+RESIDUE]
    /*#0 Bias low  residue.*/
    paddsw mm1,mm0
    /*#0 Bias high residue.*/
    paddsw mm2,mm0
    /*#0 Pack to byte.*/
    packuswb mm1,mm2
    /*#1 Bias low  residue.*/
    paddsw mm3,mm0
    /*#1 Bias high residue.*/
    paddsw mm4,mm0
    /*#1 Pack to byte.*/
    packuswb mm3,mm4
    /*#2 Bias low  residue.*/
    paddsw mm5,mm0
    /*#2 Bias high residue.*/
    paddsw mm6,mm0
    /*#2 Pack to byte.*/
    packuswb mm5,mm6
    /*#0 Write row.*/
    movq [DST],mm1
    /*#1 Write row.*/
    movq [DST+YSTRIDE],mm3
    /*#2 Write row.*/
    movq [DST+YSTRIDE*2],mm5
    /*#3 Load low residue.*/
    movq mm1,[6*8+RESIDUE]
    /*#3 Load high residue.*/
    movq mm2,[7*8+RESIDUE]
    /*#4 Load low residue.*/
    movq mm3,[8*8+RESIDUE]
    /*#4 Load high residue.*/
    movq mm4,[9*8+RESIDUE]
    /*#5 Load low residue.*/
    movq mm5,[10*8+RESIDUE]
    /*#5 Load high residue.*/
    movq mm6,[11*8+RESIDUE]
    /*#3 Bias low  residue.*/
    paddsw mm1,mm0
    /*#3 Bias high residue.*/
    paddsw mm2,mm0
    /*#3 Pack to byte.*/
    packuswb mm1,mm2
    /*#4 Bias low  residue.*/
    paddsw mm3,mm0
    /*#4 Bias high residue.*/
    paddsw mm4,mm0
    /*#4 Pack to byte.*/
    packuswb mm3,mm4
    /*#5 Bias low  residue.*/
    paddsw mm5,mm0
    /*#5 Bias high residue.*/
    paddsw mm6,mm0
    /*#5 Pack to byte.*/
    packuswb mm5,mm6
    /*#3 Write row.*/
    movq [DST+YSTRIDE3],mm1
    /*#4 Write row.*/
    movq [DST4],mm3
    /*#5 Write row.*/
    movq [DST4+YSTRIDE],mm5
    /*#6 Load low residue.*/
    movq mm1,[12*8+RESIDUE]
    /*#6 Load high residue.*/
    movq mm2,[13*8+RESIDUE]
    /*#7 Load low residue.*/
    movq mm3,[14*8+RESIDUE]
    /*#7 Load high residue.*/
    movq mm4,[15*8+RESIDUE]
    /*#6 Bias low  residue.*/
    paddsw mm1,mm0
    /*#6 Bias high residue.*/
    paddsw mm2,mm0
    /*#6 Pack to byte.*/
    packuswb mm1,mm2
    /*#7 Bias low  residue.*/
    paddsw mm3,mm0
    /*#7 Bias high residue.*/
    paddsw mm4,mm0
    /*#7 Pack to byte.*/
    packuswb mm3,mm4
    /*#6 Write row.*/
    movq [DST4+YSTRIDE*2],mm1
    /*#7 Write row.*/
    movq [DST4+YSTRIDE3],mm3
#undef DST
#undef DST4
#undef YSTRIDE
#undef YSTRIDE3
#undef RESIDUE
  }
}

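/*For reference only: a plain-C sketch of the intra reconstruction above.
  An intra residue is centered on zero, so the output pixel is residue+128,
   clamped to [0,255]; the MMX code gets the same effect by adding the
   0x0080 bias words with signed saturation (paddsw) and packing with
   unsigned saturation (packuswb).
  The name oc_frag_recon_intra_c is hypothetical; the block is compiled out
   with #if 0.*/
#if 0
static void oc_frag_recon_intra_c(unsigned char *_dst,int _ystride,
 const ogg_int16_t *_residue){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      int p;
      p=_residue[i*8+j]+128;
      _dst[j]=(unsigned char)(p<0?0:p>255?255:p);
    }
    _dst+=_ystride;
  }
}
#endif
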
/*Reconstructs an inter-coded 8x8 fragment: adds the residue to the
   predictor in _src and clamps the result to [0,255].*/
void oc_frag_recon_inter_mmx(unsigned char *_dst,const unsigned char *_src,
 int _ystride,const ogg_int16_t *_residue){
  int i;
  /*Zero mm0.*/
  __asm pxor mm0,mm0;
  for(i=4;i-->0;){
    __asm{
#define DST edx
#define SRC ecx
#define YSTRIDE edi
#define RESIDUE eax
      mov DST,_dst
      mov SRC,_src
      mov YSTRIDE,_ystride
      mov RESIDUE,_residue
      /*#0 Load source.*/
      movq mm3,[SRC]
      /*#1 Load source.*/
      movq mm7,[SRC+YSTRIDE]
      /*#0 Get copy of src.*/
      movq mm4,mm3
      /*#0 Expand high source.*/
      punpckhbw mm4,mm0
      /*#0 Expand low  source.*/
      punpcklbw mm3,mm0
      /*#0 Add residue high.*/
      paddsw mm4,[8+RESIDUE]
      /*#1 Get copy of src.*/
      movq mm2,mm7
      /*#0 Add residue low.*/
      paddsw mm3,[RESIDUE]
      /*#1 Expand high source.*/
      punpckhbw mm2,mm0
      /*#0 Pack final row pixels.*/
      packuswb mm3,mm4
      /*#1 Expand low  source.*/
      punpcklbw mm7,mm0
      /*#1 Add residue low.*/
      paddsw mm7,[16+RESIDUE]
      /*#1 Add residue high.*/
      paddsw mm2,[24+RESIDUE]
      /*Advance residue.*/
      lea RESIDUE,[32+RESIDUE]
      /*#1 Pack final row pixels.*/
      packuswb mm7,mm2
      /*Advance src.*/
      lea SRC,[SRC+YSTRIDE*2]
      /*#0 Write row.*/
      movq [DST],mm3
      /*#1 Write row.*/
      movq [DST+YSTRIDE],mm7
      /*Advance dst.*/
      lea DST,[DST+YSTRIDE*2]
      mov _residue,RESIDUE
      mov _dst,DST
      mov _src,SRC
#undef DST
#undef SRC
#undef YSTRIDE
#undef RESIDUE
    }
  }
}

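/*For reference only: a plain-C sketch of the inter reconstruction above:
   dst=src+residue, clamped to [0,255] per pixel.
  The MMX code widens the source bytes to words (punpcklbw/punpckhbw), adds
   the residue with signed saturation, and packs back with unsigned
   saturation.
  The name oc_frag_recon_inter_c is hypothetical; the block is compiled out
   with #if 0.*/
#if 0
static void oc_frag_recon_inter_c(unsigned char *_dst,
 const unsigned char *_src,int _ystride,const ogg_int16_t *_residue){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      int p;
      p=_src[j]+_residue[i*8+j];
      _dst[j]=(unsigned char)(p<0?0:p>255?255:p);
    }
    _dst+=_ystride;
    _src+=_ystride;
  }
}
#endif
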
/*Reconstructs a bi-directionally predicted 8x8 fragment: adds the residue
   to the average of the two predictors and clamps the result to [0,255].*/
void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
 const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue){
  int i;
  /*Zero mm7.*/
  __asm pxor mm7,mm7;
  for(i=4;i-->0;){
    __asm{
#define SRC1 ecx
#define SRC2 edi
#define YSTRIDE esi
#define RESIDUE edx
#define DST eax
      mov YSTRIDE,_ystride
      mov DST,_dst
      mov RESIDUE,_residue
      mov SRC1,_src1
      mov SRC2,_src2
      /*#0 Load src1.*/
      movq mm0,[SRC1]
      /*#0 Load src2.*/
      movq mm2,[SRC2]
      /*#0 Copy src1.*/
      movq mm1,mm0
      /*#0 Copy src2.*/
      movq mm3,mm2
      /*#1 Load src1.*/
      movq mm4,[SRC1+YSTRIDE]
      /*#0 Unpack lower src1.*/
      punpcklbw mm0,mm7
      /*#1 Load src2.*/
      movq mm5,[SRC2+YSTRIDE]
      /*#0 Unpack higher src1.*/
      punpckhbw mm1,mm7
      /*#0 Unpack lower src2.*/
      punpcklbw mm2,mm7
      /*#0 Unpack higher src2.*/
      punpckhbw mm3,mm7
      /*Advance src1 ptr.*/
      lea SRC1,[SRC1+YSTRIDE*2]
      /*Advance src2 ptr.*/
      lea SRC2,[SRC2+YSTRIDE*2]
      /*#0 Lower src1+src2.*/
      paddsw mm0,mm2
      /*#0 Higher src1+src2.*/
      paddsw mm1,mm3
      /*#1 Copy src1.*/
      movq mm2,mm4
      /*#0 Build lo average.*/
      psraw mm0,1
      /*#1 Copy src2.*/
      movq mm3,mm5
      /*#1 Unpack lower src1.*/
      punpcklbw mm4,mm7
      /*#0 Build hi average.*/
      psraw mm1,1
      /*#1 Unpack higher src1.*/
      punpckhbw mm2,mm7
      /*#0 low+=residue.*/
      paddsw mm0,[RESIDUE]
      /*#1 Unpack lower src2.*/
      punpcklbw mm5,mm7
      /*#0 high+=residue.*/
      paddsw mm1,[8+RESIDUE]
      /*#1 Unpack higher src2.*/
      punpckhbw mm3,mm7
      /*#1 Lower src1+src2.*/
      paddsw mm5,mm4
      /*#0 Pack and saturate.*/
      packuswb mm0,mm1
      /*#1 Higher src1+src2.*/
      paddsw mm3,mm2
      /*#0 Write row.*/
      movq [DST],mm0
      /*#1 Build lo average.*/
      psraw mm5,1
      /*#1 Build hi average.*/
      psraw mm3,1
      /*#1 low+=residue.*/
      paddsw mm5,[16+RESIDUE]
      /*#1 high+=residue.*/
      paddsw mm3,[24+RESIDUE]
      /*#1 Pack and saturate.*/
      packuswb mm5,mm3
      /*#1 Write row.*/
      movq [DST+YSTRIDE],mm5
      /*Advance residue ptr.*/
      add RESIDUE,32
      /*Advance dest ptr.*/
      lea DST,[DST+YSTRIDE*2]
      mov _dst,DST
      mov _residue,RESIDUE
      mov _src1,SRC1
      mov _src2,SRC2
#undef SRC1
#undef SRC2
#undef YSTRIDE
#undef RESIDUE
#undef DST
    }
  }
}

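/*For reference only: a plain-C sketch of the bi-predicted reconstruction
   above: dst=((src1+src2)>>1)+residue, clamped to [0,255] per pixel.
  The MMX code forms the truncated average of the two widened predictors
   with paddsw/psraw before adding the residue.
  The name oc_frag_recon_inter2_c is hypothetical; the block is compiled out
   with #if 0.*/
#if 0
static void oc_frag_recon_inter2_c(unsigned char *_dst,
 const unsigned char *_src1,const unsigned char *_src2,int _ystride,
 const ogg_int16_t *_residue){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      int p;
      p=((_src1[j]+_src2[j])>>1)+_residue[i*8+j];
      _dst[j]=(unsigned char)(p<0?0:p>255?255:p);
    }
    _dst+=_ystride;
    _src1+=_ystride;
    _src2+=_ystride;
  }
}
#endif
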
/*Clears the MMX register state so that subsequent x87 floating-point code
   can run.*/
void oc_restore_fpu_mmx(void){
  __asm emms;
}

#endif