1 /********************************************************************
2  *                                                                  *
3  * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
4  * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
5  * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
7  *                                                                  *
8  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
9  * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
10  *                                                                  *
11  ********************************************************************
12 
13   function:
14     last mod: $Id: mmxfrag.c 16578 2009-09-25 19:50:48Z cristianadam $
15 
16  ********************************************************************/
17 
18 /*MMX acceleration of fragment reconstruction for motion compensation.
19   Originally written by Rudolf Marek.
20   Additional optimization by Nils Pipenbrinck.
21   Note: Loops are unrolled for best performance.
22   The iteration each instruction belongs to is marked in the comments as #i.*/
23 #include <stddef.h>
24 #include "x86int.h"
25 #include "mmxfrag.h"
26 
27 #if defined(OC_X86_ASM)
28 
/*Copies one 8x8 block of pixels from _src to _dst.
  _dst:     The destination block.
  _src:     The source block.
  _ystride: The number of bytes between rows.
  The actual copy is emitted by the OC_FRAG_COPY_MMX macro, which is not
   defined in this file (presumably it comes from mmxfrag.h).
  The register aliases below exist only for the duration of that macro
   invocation and are removed again immediately afterwards.*/
void oc_frag_copy_mmx(unsigned char *_dst,
 const unsigned char *_src,int _ystride){
#define DST eax
#define SRC edx
#define YSTRIDE ecx
#define YSTRIDE3 esi
  OC_FRAG_COPY_MMX(_dst,_src,_ystride);
#undef YSTRIDE3
#undef YSTRIDE
#undef SRC
#undef DST
}
43 
oc_frag_recon_intra_mmx(unsigned char * _dst,int _ystride,const ogg_int16_t * _residue)44 void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
45  const ogg_int16_t *_residue){
46   __asm{
47 #define DST edx
48 #define DST4 esi
49 #define YSTRIDE eax
50 #define YSTRIDE3 edi
51 #define RESIDUE ecx
52     mov DST,_dst
53     mov YSTRIDE,_ystride
54     mov RESIDUE,_residue
55     lea DST4,[DST+YSTRIDE*4]
56     lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
57     /*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/
58     pcmpeqw mm0,mm0
59     /*#0 Load low residue.*/
60     movq mm1,[0*8+RESIDUE]
61     /*#0 Load high residue.*/
62     movq mm2,[1*8+RESIDUE]
63     /*Set mm0 to 0x8000800080008000.*/
64     psllw mm0,15
65     /*#1 Load low residue.*/
66     movq mm3,[2*8+RESIDUE]
67     /*#1 Load high residue.*/
68     movq mm4,[3*8+RESIDUE]
69     /*Set mm0 to 0x0080008000800080.*/
70     psrlw mm0,8
71     /*#2 Load low residue.*/
72     movq mm5,[4*8+RESIDUE]
73     /*#2 Load high residue.*/
74     movq mm6,[5*8+RESIDUE]
75     /*#0 Bias low  residue.*/
76     paddsw mm1,mm0
77     /*#0 Bias high residue.*/
78     paddsw mm2,mm0
79     /*#0 Pack to byte.*/
80     packuswb mm1,mm2
81     /*#1 Bias low  residue.*/
82     paddsw mm3,mm0
83     /*#1 Bias high residue.*/
84     paddsw mm4,mm0
85     /*#1 Pack to byte.*/
86     packuswb mm3,mm4
87     /*#2 Bias low  residue.*/
88     paddsw mm5,mm0
89     /*#2 Bias high residue.*/
90     paddsw mm6,mm0
91     /*#2 Pack to byte.*/
92     packuswb mm5,mm6
93     /*#0 Write row.*/
94     movq [DST],mm1
95     /*#1 Write row.*/
96     movq [DST+YSTRIDE],mm3
97     /*#2 Write row.*/
98     movq [DST+YSTRIDE*2],mm5
99     /*#3 Load low residue.*/
100     movq mm1,[6*8+RESIDUE]
101     /*#3 Load high residue.*/
102     movq mm2,[7*8+RESIDUE]
103     /*#4 Load high residue.*/
104     movq mm3,[8*8+RESIDUE]
105     /*#4 Load high residue.*/
106     movq mm4,[9*8+RESIDUE]
107     /*#5 Load high residue.*/
108     movq mm5,[10*8+RESIDUE]
109     /*#5 Load high residue.*/
110     movq mm6,[11*8+RESIDUE]
111     /*#3 Bias low  residue.*/
112     paddsw mm1,mm0
113     /*#3 Bias high residue.*/
114     paddsw mm2,mm0
115     /*#3 Pack to byte.*/
116     packuswb mm1,mm2
117     /*#4 Bias low  residue.*/
118     paddsw mm3,mm0
119     /*#4 Bias high residue.*/
120     paddsw mm4,mm0
121     /*#4 Pack to byte.*/
122     packuswb mm3,mm4
123     /*#5 Bias low  residue.*/
124     paddsw mm5,mm0
125     /*#5 Bias high residue.*/
126     paddsw mm6,mm0
127     /*#5 Pack to byte.*/
128     packuswb mm5,mm6
129     /*#3 Write row.*/
130     movq [DST+YSTRIDE3],mm1
131     /*#4 Write row.*/
132     movq [DST4],mm3
133     /*#5 Write row.*/
134     movq [DST4+YSTRIDE],mm5
135     /*#6 Load low residue.*/
136     movq mm1,[12*8+RESIDUE]
137     /*#6 Load high residue.*/
138     movq mm2,[13*8+RESIDUE]
139     /*#7 Load low residue.*/
140     movq mm3,[14*8+RESIDUE]
141     /*#7 Load high residue.*/
142     movq mm4,[15*8+RESIDUE]
143     /*#6 Bias low  residue.*/
144     paddsw mm1,mm0
145     /*#6 Bias high residue.*/
146     paddsw mm2,mm0
147     /*#6 Pack to byte.*/
148     packuswb mm1,mm2
149     /*#7 Bias low  residue.*/
150     paddsw mm3,mm0
151     /*#7 Bias high residue.*/
152     paddsw mm4,mm0
153     /*#7 Pack to byte.*/
154     packuswb mm3,mm4
155     /*#6 Write row.*/
156     movq [DST4+YSTRIDE*2],mm1
157     /*#7 Write row.*/
158     movq [DST4+YSTRIDE3],mm3
159 #undef DST
160 #undef DST4
161 #undef YSTRIDE
162 #undef YSTRIDE3
163 #undef RESIDUE
164   }
165 }
166 
oc_frag_recon_inter_mmx(unsigned char * _dst,const unsigned char * _src,int _ystride,const ogg_int16_t * _residue)167 void oc_frag_recon_inter_mmx(unsigned char *_dst,const unsigned char *_src,
168  int _ystride,const ogg_int16_t *_residue){
169   int i;
170   /*Zero mm0.*/
171   __asm pxor mm0,mm0;
172   for(i=4;i-->0;){
173     __asm{
174 #define DST edx
175 #define SRC ecx
176 #define YSTRIDE edi
177 #define RESIDUE eax
178       mov DST,_dst
179       mov SRC,_src
180       mov YSTRIDE,_ystride
181       mov RESIDUE,_residue
182       /*#0 Load source.*/
183       movq mm3,[SRC]
184       /*#1 Load source.*/
185       movq mm7,[SRC+YSTRIDE]
186       /*#0 Get copy of src.*/
187       movq mm4,mm3
188       /*#0 Expand high source.*/
189       punpckhbw mm4,mm0
190       /*#0 Expand low  source.*/
191       punpcklbw mm3,mm0
192       /*#0 Add residue high.*/
193       paddsw mm4,[8+RESIDUE]
194       /*#1 Get copy of src.*/
195       movq mm2,mm7
196       /*#0 Add residue low.*/
197       paddsw  mm3,[RESIDUE]
198       /*#1 Expand high source.*/
199       punpckhbw mm2,mm0
200       /*#0 Pack final row pixels.*/
201       packuswb mm3,mm4
202       /*#1 Expand low  source.*/
203       punpcklbw mm7,mm0
204       /*#1 Add residue low.*/
205       paddsw mm7,[16+RESIDUE]
206       /*#1 Add residue high.*/
207       paddsw mm2,[24+RESIDUE]
208       /*Advance residue.*/
209       lea RESIDUE,[32+RESIDUE]
210       /*#1 Pack final row pixels.*/
211       packuswb mm7,mm2
212       /*Advance src.*/
213       lea SRC,[SRC+YSTRIDE*2]
214       /*#0 Write row.*/
215       movq [DST],mm3
216       /*#1 Write row.*/
217       movq [DST+YSTRIDE],mm7
218       /*Advance dst.*/
219       lea DST,[DST+YSTRIDE*2]
220       mov _residue,RESIDUE
221       mov _dst,DST
222       mov _src,SRC
223 #undef DST
224 #undef SRC
225 #undef YSTRIDE
226 #undef RESIDUE
227     }
228   }
229 }
230 
oc_frag_recon_inter2_mmx(unsigned char * _dst,const unsigned char * _src1,const unsigned char * _src2,int _ystride,const ogg_int16_t * _residue)231 void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
232  const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue){
233   int i;
234   /*Zero mm7.*/
235   __asm pxor mm7,mm7;
236   for(i=4;i-->0;){
237     __asm{
238 #define SRC1 ecx
239 #define SRC2 edi
240 #define YSTRIDE esi
241 #define RESIDUE edx
242 #define DST eax
243       mov YSTRIDE,_ystride
244       mov DST,_dst
245       mov RESIDUE,_residue
246       mov SRC1,_src1
247       mov SRC2,_src2
248       /*#0 Load src1.*/
249       movq mm0,[SRC1]
250       /*#0 Load src2.*/
251       movq mm2,[SRC2]
252       /*#0 Copy src1.*/
253       movq mm1,mm0
254       /*#0 Copy src2.*/
255       movq mm3,mm2
256       /*#1 Load src1.*/
257       movq mm4,[SRC1+YSTRIDE]
258       /*#0 Unpack lower src1.*/
259       punpcklbw mm0,mm7
260       /*#1 Load src2.*/
261       movq mm5,[SRC2+YSTRIDE]
262       /*#0 Unpack higher src1.*/
263       punpckhbw mm1,mm7
264       /*#0 Unpack lower src2.*/
265       punpcklbw mm2,mm7
266       /*#0 Unpack higher src2.*/
267       punpckhbw mm3,mm7
268       /*Advance src1 ptr.*/
269       lea SRC1,[SRC1+YSTRIDE*2]
270       /*Advance src2 ptr.*/
271       lea SRC2,[SRC2+YSTRIDE*2]
272       /*#0 Lower src1+src2.*/
273       paddsw mm0,mm2
274       /*#0 Higher src1+src2.*/
275       paddsw mm1,mm3
276       /*#1 Copy src1.*/
277       movq mm2,mm4
278       /*#0 Build lo average.*/
279       psraw mm0,1
280       /*#1 Copy src2.*/
281       movq mm3,mm5
282       /*#1 Unpack lower src1.*/
283       punpcklbw mm4,mm7
284       /*#0 Build hi average.*/
285       psraw mm1,1
286       /*#1 Unpack higher src1.*/
287       punpckhbw mm2,mm7
288       /*#0 low+=residue.*/
289       paddsw mm0,[RESIDUE]
290       /*#1 Unpack lower src2.*/
291       punpcklbw mm5,mm7
292       /*#0 high+=residue.*/
293       paddsw mm1,[8+RESIDUE]
294       /*#1 Unpack higher src2.*/
295       punpckhbw mm3,mm7
296       /*#1 Lower src1+src2.*/
297       paddsw mm5,mm4
298       /*#0 Pack and saturate.*/
299       packuswb mm0,mm1
300       /*#1 Higher src1+src2.*/
301       paddsw mm3,mm2
302       /*#0 Write row.*/
303       movq [DST],mm0
304       /*#1 Build lo average.*/
305       psraw mm5,1
306       /*#1 Build hi average.*/
307       psraw mm3,1
308       /*#1 low+=residue.*/
309       paddsw mm5,[16+RESIDUE]
310       /*#1 high+=residue.*/
311       paddsw mm3,[24+RESIDUE]
312       /*#1 Pack and saturate.*/
313       packuswb  mm5,mm3
314       /*#1 Write row ptr.*/
315       movq [DST+YSTRIDE],mm5
316       /*Advance residue ptr.*/
317       add RESIDUE,32
318       /*Advance dest ptr.*/
319       lea DST,[DST+YSTRIDE*2]
320       mov _dst,DST
321       mov _residue,RESIDUE
322       mov _src1,SRC1
323       mov _src2,SRC2
324 #undef SRC1
325 #undef SRC2
326 #undef YSTRIDE
327 #undef RESIDUE
328 #undef DST
329     }
330   }
331 }
332 
oc_restore_fpu_mmx(void)333 void oc_restore_fpu_mmx(void){
334   __asm emms;
335 }
336 
337 #endif
338