/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
 *                                                                  *
 ********************************************************************

  function:
  last mod: $Id: mmxfrag.c 16578 2009-09-25 19:50:48Z cristianadam $

 ********************************************************************/
17
/*MMX acceleration of fragment reconstruction for motion compensation.
  Originally written by Rudolf Marek.
  Additional optimization by Nils Pipenbrinck.
  Note: Loops are unrolled for best performance.
  The iteration each instruction belongs to is marked in the comments as #i.*/
23 #include <stddef.h>
24 #include "x86int.h"
25 #include "mmxfrag.h"
26
27 #if defined(OC_X86_ASM)
28
/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
   between rows.
  _dst:     Destination fragment (8x8 bytes).
  _src:     Source fragment (8x8 bytes).
  _ystride: Bytes between successive rows in both _dst and _src.
  The register-name macros below are the calling contract of the
   OC_FRAG_COPY_MMX() inline-asm macro (defined in mmxfrag.h), which expands
   using these symbolic names; do not change them independently of that
   macro.*/
void oc_frag_copy_mmx(unsigned char *_dst,
 const unsigned char *_src,int _ystride){
#define SRC edx
#define DST eax
#define YSTRIDE ecx
#define YSTRIDE3 esi
  OC_FRAG_COPY_MMX(_dst,_src,_ystride);
#undef SRC
#undef DST
#undef YSTRIDE
#undef YSTRIDE3
}
43
/*Reconstructs an intra-coded 8x8 fragment: each 16-bit residue word is
   biased by +128 (saturating add of 0x0080 in each lane) and then packed to
   an unsigned byte with saturation, clamping the result to [0,255].
  _dst:     Output fragment (8x8 bytes), _ystride bytes between rows.
  _ystride: Bytes between successive output rows.
  _residue: 64 16-bit residue values, row-major.
  Rows are processed in three unrolled groups (3+3+2) to interleave loads,
   adds and stores; #i in the comments marks which row an instruction
   belongs to.
  Leaves the MMX state dirty; oc_restore_fpu_mmx() issues EMMS.*/
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
 const ogg_int16_t *_residue){
  __asm{
#define DST edx
#define DST4 esi
#define YSTRIDE eax
#define YSTRIDE3 edi
#define RESIDUE ecx
    mov DST,_dst
    mov YSTRIDE,_ystride
    mov RESIDUE,_residue
    /*DST4 points at row 4; rows 0-3 are addressed off DST, rows 4-7 off
       DST4, using YSTRIDE and YSTRIDE3 ( = 3*_ystride) as offsets.*/
    lea DST4,[DST+YSTRIDE*4]
    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
    /*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/
    pcmpeqw mm0,mm0
    /*#0 Load low residue.*/
    movq mm1,[0*8+RESIDUE]
    /*#0 Load high residue.*/
    movq mm2,[1*8+RESIDUE]
    /*Set mm0 to 0x8000800080008000.*/
    psllw mm0,15
    /*#1 Load low residue.*/
    movq mm3,[2*8+RESIDUE]
    /*#1 Load high residue.*/
    movq mm4,[3*8+RESIDUE]
    /*Set mm0 to 0x0080008000800080: the +128 bias for every word.*/
    psrlw mm0,8
    /*#2 Load low residue.*/
    movq mm5,[4*8+RESIDUE]
    /*#2 Load high residue.*/
    movq mm6,[5*8+RESIDUE]
    /*#0 Bias low residue.*/
    paddsw mm1,mm0
    /*#0 Bias high residue.*/
    paddsw mm2,mm0
    /*#0 Pack to byte.*/
    packuswb mm1,mm2
    /*#1 Bias low residue.*/
    paddsw mm3,mm0
    /*#1 Bias high residue.*/
    paddsw mm4,mm0
    /*#1 Pack to byte.*/
    packuswb mm3,mm4
    /*#2 Bias low residue.*/
    paddsw mm5,mm0
    /*#2 Bias high residue.*/
    paddsw mm6,mm0
    /*#2 Pack to byte.*/
    packuswb mm5,mm6
    /*#0 Write row.*/
    movq [DST],mm1
    /*#1 Write row.*/
    movq [DST+YSTRIDE],mm3
    /*#2 Write row.*/
    movq [DST+YSTRIDE*2],mm5
    /*#3 Load low residue.*/
    movq mm1,[6*8+RESIDUE]
    /*#3 Load high residue.*/
    movq mm2,[7*8+RESIDUE]
    /*#4 Load low residue.*/
    movq mm3,[8*8+RESIDUE]
    /*#4 Load high residue.*/
    movq mm4,[9*8+RESIDUE]
    /*#5 Load low residue.*/
    movq mm5,[10*8+RESIDUE]
    /*#5 Load high residue.*/
    movq mm6,[11*8+RESIDUE]
    /*#3 Bias low residue.*/
    paddsw mm1,mm0
    /*#3 Bias high residue.*/
    paddsw mm2,mm0
    /*#3 Pack to byte.*/
    packuswb mm1,mm2
    /*#4 Bias low residue.*/
    paddsw mm3,mm0
    /*#4 Bias high residue.*/
    paddsw mm4,mm0
    /*#4 Pack to byte.*/
    packuswb mm3,mm4
    /*#5 Bias low residue.*/
    paddsw mm5,mm0
    /*#5 Bias high residue.*/
    paddsw mm6,mm0
    /*#5 Pack to byte.*/
    packuswb mm5,mm6
    /*#3 Write row.*/
    movq [DST+YSTRIDE3],mm1
    /*#4 Write row.*/
    movq [DST4],mm3
    /*#5 Write row.*/
    movq [DST4+YSTRIDE],mm5
    /*#6 Load low residue.*/
    movq mm1,[12*8+RESIDUE]
    /*#6 Load high residue.*/
    movq mm2,[13*8+RESIDUE]
    /*#7 Load low residue.*/
    movq mm3,[14*8+RESIDUE]
    /*#7 Load high residue.*/
    movq mm4,[15*8+RESIDUE]
    /*#6 Bias low residue.*/
    paddsw mm1,mm0
    /*#6 Bias high residue.*/
    paddsw mm2,mm0
    /*#6 Pack to byte.*/
    packuswb mm1,mm2
    /*#7 Bias low residue.*/
    paddsw mm3,mm0
    /*#7 Bias high residue.*/
    paddsw mm4,mm0
    /*#7 Pack to byte.*/
    packuswb mm3,mm4
    /*#6 Write row.*/
    movq [DST4+YSTRIDE*2],mm1
    /*#7 Write row.*/
    movq [DST4+YSTRIDE3],mm3
#undef DST
#undef DST4
#undef YSTRIDE
#undef YSTRIDE3
#undef RESIDUE
  }
}
166
/*Reconstructs an inter-coded 8x8 fragment: adds the 16-bit residue to the
   predictor pixels in _src (widened to 16 bits) and packs the sums back to
   bytes with unsigned saturation.
  _dst:     Output fragment (8x8 bytes), _ystride bytes between rows.
  _src:     Predictor fragment (8x8 bytes), _ystride bytes between rows.
  _ystride: Bytes between successive rows in both _dst and _src.
  _residue: 64 16-bit residue values, row-major.
  Two rows are handled per loop iteration (#0 and #1 in the comments), four
   iterations total.*/
void oc_frag_recon_inter_mmx(unsigned char *_dst,const unsigned char *_src,
 int _ystride,const ogg_int16_t *_residue){
  int i;
  /*Zero mm0; it is the all-zero operand used to unpack bytes to words in
     every iteration below.*/
  __asm pxor mm0,mm0;
  for(i=4;i-->0;){
    __asm{
#define DST edx
#define SRC ecx
#define YSTRIDE edi
#define RESIDUE eax
      /*Reload the working pointers from the C locals; the advanced values
         are stored back at the bottom of this block.*/
      mov DST,_dst
      mov SRC,_src
      mov YSTRIDE,_ystride
      mov RESIDUE,_residue
      /*#0 Load source.*/
      movq mm3,[SRC]
      /*#1 Load source.*/
      movq mm7,[SRC+YSTRIDE]
      /*#0 Get copy of src.*/
      movq mm4,mm3
      /*#0 Expand high source.*/
      punpckhbw mm4,mm0
      /*#0 Expand low source.*/
      punpcklbw mm3,mm0
      /*#0 Add residue high.*/
      paddsw mm4,[8+RESIDUE]
      /*#1 Get copy of src.*/
      movq mm2,mm7
      /*#0 Add residue low.*/
      paddsw mm3,[RESIDUE]
      /*#1 Expand high source.*/
      punpckhbw mm2,mm0
      /*#0 Pack final row pixels.*/
      packuswb mm3,mm4
      /*#1 Expand low source.*/
      punpcklbw mm7,mm0
      /*#1 Add residue low.*/
      paddsw mm7,[16+RESIDUE]
      /*#1 Add residue high.*/
      paddsw mm2,[24+RESIDUE]
      /*Advance residue (2 rows = 32 bytes of 16-bit values).*/
      lea RESIDUE,[32+RESIDUE]
      /*#1 Pack final row pixels.*/
      packuswb mm7,mm2
      /*Advance src.*/
      lea SRC,[SRC+YSTRIDE*2]
      /*#0 Write row.*/
      movq [DST],mm3
      /*#1 Write row.*/
      movq [DST+YSTRIDE],mm7
      /*Advance dst.*/
      lea DST,[DST+YSTRIDE*2]
      /*Save the advanced pointers back to the C locals for the next
         iteration.*/
      mov _residue,RESIDUE
      mov _dst,DST
      mov _src,SRC
#undef DST
#undef SRC
#undef YSTRIDE
#undef RESIDUE
    }
  }
}
230
/*Reconstructs a bi-predicted 8x8 fragment: averages the two predictors
   ((src1+src2)>>1, computed in 16-bit precision so no overflow occurs),
   adds the 16-bit residue, and packs back to bytes with unsigned
   saturation.
  _dst:     Output fragment (8x8 bytes), _ystride bytes between rows.
  _src1:    First predictor fragment (8x8 bytes).
  _src2:    Second predictor fragment (8x8 bytes).
  _ystride: Bytes between successive rows in _dst, _src1 and _src2.
  _residue: 64 16-bit residue values, row-major.
  Two rows are handled per loop iteration (#0 and #1 in the comments), four
   iterations total.*/
void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
 const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue){
  int i;
  /*Zero mm7; it is the all-zero operand used to unpack bytes to words in
     every iteration below.*/
  __asm pxor mm7,mm7;
  for(i=4;i-->0;){
    __asm{
#define SRC1 ecx
#define SRC2 edi
#define YSTRIDE esi
#define RESIDUE edx
#define DST eax
      /*Reload the working pointers from the C locals; the advanced values
         are stored back at the bottom of this block.*/
      mov YSTRIDE,_ystride
      mov DST,_dst
      mov RESIDUE,_residue
      mov SRC1,_src1
      mov SRC2,_src2
      /*#0 Load src1.*/
      movq mm0,[SRC1]
      /*#0 Load src2.*/
      movq mm2,[SRC2]
      /*#0 Copy src1.*/
      movq mm1,mm0
      /*#0 Copy src2.*/
      movq mm3,mm2
      /*#1 Load src1.*/
      movq mm4,[SRC1+YSTRIDE]
      /*#0 Unpack lower src1.*/
      punpcklbw mm0,mm7
      /*#1 Load src2.*/
      movq mm5,[SRC2+YSTRIDE]
      /*#0 Unpack higher src1.*/
      punpckhbw mm1,mm7
      /*#0 Unpack lower src2.*/
      punpcklbw mm2,mm7
      /*#0 Unpack higher src2.*/
      punpckhbw mm3,mm7
      /*Advance src1 ptr.*/
      lea SRC1,[SRC1+YSTRIDE*2]
      /*Advance src2 ptr.*/
      lea SRC2,[SRC2+YSTRIDE*2]
      /*#0 Lower src1+src2.*/
      paddsw mm0,mm2
      /*#0 Higher src1+src2.*/
      paddsw mm1,mm3
      /*#1 Copy src1.*/
      movq mm2,mm4
      /*#0 Build lo average.*/
      psraw mm0,1
      /*#1 Copy src2.*/
      movq mm3,mm5
      /*#1 Unpack lower src1.*/
      punpcklbw mm4,mm7
      /*#0 Build hi average.*/
      psraw mm1,1
      /*#1 Unpack higher src1.*/
      punpckhbw mm2,mm7
      /*#0 low+=residue.*/
      paddsw mm0,[RESIDUE]
      /*#1 Unpack lower src2.*/
      punpcklbw mm5,mm7
      /*#0 high+=residue.*/
      paddsw mm1,[8+RESIDUE]
      /*#1 Unpack higher src2.*/
      punpckhbw mm3,mm7
      /*#1 Lower src1+src2.*/
      paddsw mm5,mm4
      /*#0 Pack and saturate.*/
      packuswb mm0,mm1
      /*#1 Higher src1+src2.*/
      paddsw mm3,mm2
      /*#0 Write row.*/
      movq [DST],mm0
      /*#1 Build lo average.*/
      psraw mm5,1
      /*#1 Build hi average.*/
      psraw mm3,1
      /*#1 low+=residue.*/
      paddsw mm5,[16+RESIDUE]
      /*#1 high+=residue.*/
      paddsw mm3,[24+RESIDUE]
      /*#1 Pack and saturate.*/
      packuswb mm5,mm3
      /*#1 Write row.*/
      movq [DST+YSTRIDE],mm5
      /*Advance residue ptr (2 rows = 32 bytes of 16-bit values).*/
      add RESIDUE,32
      /*Advance dest ptr.*/
      lea DST,[DST+YSTRIDE*2]
      /*Save the advanced pointers back to the C locals for the next
         iteration.*/
      mov _dst,DST
      mov _residue,RESIDUE
      mov _src1,SRC1
      mov _src2,SRC2
#undef SRC1
#undef SRC2
#undef YSTRIDE
#undef RESIDUE
#undef DST
    }
  }
}
332
/*Issues EMMS to clear the MMX state, making the shared x87 register stack
   usable for floating-point code again.
  The MMX routines above leave the state dirty and rely on this being called
   before any subsequent x87 use.*/
void oc_restore_fpu_mmx(void){
  __asm emms;
}
336
337 #endif
338