1 /********************************************************************
2 * *
3 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
4 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
5 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
7 * *
8 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
9 * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
10 * *
11 ********************************************************************
12
13 function:
14 last mod: $Id$
15
16 ********************************************************************/
17
18 /*MMX acceleration of fragment reconstruction for motion compensation.
19 Originally written by Rudolf Marek.
20 Additional optimization by Nils Pipenbrinck.
21 Note: Loops are unrolled for best performance.
22 The iteration each instruction belongs to is marked in the comments as #i.*/
23 #include <stddef.h>
24 #include "x86int.h"
25
26 #if defined(OC_X86_ASM)
27
/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
   between rows.
  The caller must #define SRC, DST, YSTRIDE, and YSTRIDE3 to four distinct
   general-purpose registers before expanding this macro (and #undef them
   after); all four registers and mm0-mm3 are clobbered.
  The copy is fully unrolled: rows 0-3 are copied, SRC/DST are advanced by
   4*ystride, then rows 4-7 are copied.*/
# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
  do{ \
    const unsigned char *src; \
    unsigned char *dst; \
    src=(_src); \
    dst=(_dst); \
    __asm mov SRC,src \
    __asm mov DST,dst \
    __asm mov YSTRIDE,_ystride \
    /*src+0*ystride*/ \
    __asm movq mm0,[SRC] \
    /*src+1*ystride*/ \
    __asm movq mm1,[SRC+YSTRIDE] \
    /*ystride3=ystride*3*/ \
    __asm lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
    /*src+2*ystride*/ \
    __asm movq mm2,[SRC+YSTRIDE*2] \
    /*src+3*ystride*/ \
    __asm movq mm3,[SRC+YSTRIDE3] \
    /*dst+0*ystride*/ \
    __asm movq [DST],mm0 \
    /*dst+1*ystride*/ \
    __asm movq [DST+YSTRIDE],mm1 \
    /*Pointer to next 4.*/ \
    __asm lea SRC,[SRC+YSTRIDE*4] \
    /*dst+2*ystride*/ \
    __asm movq [DST+YSTRIDE*2],mm2 \
    /*dst+3*ystride*/ \
    __asm movq [DST+YSTRIDE3],mm3 \
    /*Pointer to next 4.*/ \
    __asm lea DST,[DST+YSTRIDE*4] \
    /*src+0*ystride*/ \
    __asm movq mm0,[SRC] \
    /*src+1*ystride*/ \
    __asm movq mm1,[SRC+YSTRIDE] \
    /*src+2*ystride*/ \
    __asm movq mm2,[SRC+YSTRIDE*2] \
    /*src+3*ystride*/ \
    __asm movq mm3,[SRC+YSTRIDE3] \
    /*dst+0*ystride*/ \
    __asm movq [DST],mm0 \
    /*dst+1*ystride*/ \
    __asm movq [DST+YSTRIDE],mm1 \
    /*dst+2*ystride*/ \
    __asm movq [DST+YSTRIDE*2],mm2 \
    /*dst+3*ystride*/ \
    __asm movq [DST+YSTRIDE3],mm3 \
  } \
  while(0)
79
/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
   between rows.
  _dst:     The destination block (8-byte rows).
  _src:     The source block (8-byte rows).
  _ystride: The stride, in bytes, between successive rows of both blocks.
  Note: this leaves MMX state dirty; the caller is responsible for
   oc_restore_fpu_mmx() before x87 code runs.*/
void oc_frag_copy_mmx(unsigned char *_dst,
 const unsigned char *_src,int _ystride){
/*Register assignments consumed by OC_FRAG_COPY_MMX.*/
#define SRC edx
#define DST eax
#define YSTRIDE ecx
#define YSTRIDE3 esi
  OC_FRAG_COPY_MMX(_dst,_src,_ystride);
#undef SRC
#undef DST
#undef YSTRIDE
#undef YSTRIDE3
}
94
/*Copies the fragments specified by the lists of fragment indices from one
   frame to another.
  _dst_frame:     The reference frame to copy to.
  _src_frame:     The reference frame to copy from.
  _ystride:       The row stride of the reference frames.
  _fragis:        A pointer to a list of fragment indices.
  _nfragis:       The number of fragment indices to copy.
  _frag_buf_offs: The offsets of fragments in the reference frames.
  If _nfragis<=0 this is a no-op.*/
void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
 const unsigned char *_src_frame,int _ystride,
 const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
  ptrdiff_t fragii;
  for(fragii=0;fragii<_nfragis;fragii++){
    ptrdiff_t frag_buf_off;
    /*Each fragment's byte offset into both frames is looked up indirectly
       through the index list.*/
    frag_buf_off=_frag_buf_offs[_fragis[fragii]];
/*Register assignments consumed by OC_FRAG_COPY_MMX.
  NOTE(review): YSTRIDE3 is edi here but esi in oc_frag_copy_mmx; presumably
   chosen to suit the surrounding register pressure in each function.*/
#define SRC edx
#define DST eax
#define YSTRIDE ecx
#define YSTRIDE3 edi
    OC_FRAG_COPY_MMX(_dst_frame+frag_buf_off,
     _src_frame+frag_buf_off,_ystride);
#undef SRC
#undef DST
#undef YSTRIDE
#undef YSTRIDE3
  }
}
122
/*Reconstructs an intra-coded 8x8 fragment: biases each 16-bit residue value
   by +128 (via the 0x0080 constant built in mm0), saturates to [0,255] with
   packuswb, and stores the resulting bytes to _dst, _ystride bytes apart.
  _dst:     The destination fragment (8 rows of 8 bytes).
  _ystride: The stride, in bytes, between rows of _dst.
  _residue: 64 16-bit residue values, 8 per row, in row order.
  The eight rows are processed as unrolled groups (#0-#2, #3-#5, #6-#7) so
   loads, biases, packs, and stores from different rows can overlap.*/
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
 const ogg_int16_t *_residue){
  __asm{
#define DST edx
#define DST4 esi
#define YSTRIDE eax
#define YSTRIDE3 edi
#define RESIDUE ecx
    mov DST,_dst
    mov YSTRIDE,_ystride
    mov RESIDUE,_residue
    /*DST4 points to the start of the second group of four rows.*/
    lea DST4,[DST+YSTRIDE*4]
    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
    /*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/
    pcmpeqw mm0,mm0
    /*#0 Load low residue.*/
    movq mm1,[0*8+RESIDUE]
    /*#0 Load high residue.*/
    movq mm2,[1*8+RESIDUE]
    /*Set mm0 to 0x8000800080008000.*/
    psllw mm0,15
    /*#1 Load low residue.*/
    movq mm3,[2*8+RESIDUE]
    /*#1 Load high residue.*/
    movq mm4,[3*8+RESIDUE]
    /*Set mm0 to 0x0080008000800080.*/
    psrlw mm0,8
    /*#2 Load low residue.*/
    movq mm5,[4*8+RESIDUE]
    /*#2 Load high residue.*/
    movq mm6,[5*8+RESIDUE]
    /*#0 Bias low residue.*/
    paddsw mm1,mm0
    /*#0 Bias high residue.*/
    paddsw mm2,mm0
    /*#0 Pack to byte.*/
    packuswb mm1,mm2
    /*#1 Bias low residue.*/
    paddsw mm3,mm0
    /*#1 Bias high residue.*/
    paddsw mm4,mm0
    /*#1 Pack to byte.*/
    packuswb mm3,mm4
    /*#2 Bias low residue.*/
    paddsw mm5,mm0
    /*#2 Bias high residue.*/
    paddsw mm6,mm0
    /*#2 Pack to byte.*/
    packuswb mm5,mm6
    /*#0 Write row.*/
    movq [DST],mm1
    /*#1 Write row.*/
    movq [DST+YSTRIDE],mm3
    /*#2 Write row.*/
    movq [DST+YSTRIDE*2],mm5
    /*#3 Load low residue.*/
    movq mm1,[6*8+RESIDUE]
    /*#3 Load high residue.*/
    movq mm2,[7*8+RESIDUE]
    /*#4 Load low residue.*/
    movq mm3,[8*8+RESIDUE]
    /*#4 Load high residue.*/
    movq mm4,[9*8+RESIDUE]
    /*#5 Load low residue.*/
    movq mm5,[10*8+RESIDUE]
    /*#5 Load high residue.*/
    movq mm6,[11*8+RESIDUE]
    /*#3 Bias low residue.*/
    paddsw mm1,mm0
    /*#3 Bias high residue.*/
    paddsw mm2,mm0
    /*#3 Pack to byte.*/
    packuswb mm1,mm2
    /*#4 Bias low residue.*/
    paddsw mm3,mm0
    /*#4 Bias high residue.*/
    paddsw mm4,mm0
    /*#4 Pack to byte.*/
    packuswb mm3,mm4
    /*#5 Bias low residue.*/
    paddsw mm5,mm0
    /*#5 Bias high residue.*/
    paddsw mm6,mm0
    /*#5 Pack to byte.*/
    packuswb mm5,mm6
    /*#3 Write row.*/
    movq [DST+YSTRIDE3],mm1
    /*#4 Write row.*/
    movq [DST4],mm3
    /*#5 Write row.*/
    movq [DST4+YSTRIDE],mm5
    /*#6 Load low residue.*/
    movq mm1,[12*8+RESIDUE]
    /*#6 Load high residue.*/
    movq mm2,[13*8+RESIDUE]
    /*#7 Load low residue.*/
    movq mm3,[14*8+RESIDUE]
    /*#7 Load high residue.*/
    movq mm4,[15*8+RESIDUE]
    /*#6 Bias low residue.*/
    paddsw mm1,mm0
    /*#6 Bias high residue.*/
    paddsw mm2,mm0
    /*#6 Pack to byte.*/
    packuswb mm1,mm2
    /*#7 Bias low residue.*/
    paddsw mm3,mm0
    /*#7 Bias high residue.*/
    paddsw mm4,mm0
    /*#7 Pack to byte.*/
    packuswb mm3,mm4
    /*#6 Write row.*/
    movq [DST4+YSTRIDE*2],mm1
    /*#7 Write row.*/
    movq [DST4+YSTRIDE3],mm3
#undef DST
#undef DST4
#undef YSTRIDE
#undef YSTRIDE3
#undef RESIDUE
  }
}
245
/*Reconstructs an inter-coded 8x8 fragment: adds the 16-bit residue to the
   unsigned predictor pixels from _src, saturates to [0,255], and stores the
   result to _dst, _ystride bytes between rows.
  _dst:     The destination fragment.
  _src:     The predictor fragment (the motion-compensated reference).
  _ystride: The stride, in bytes, between rows of both fragments.
  _residue: 64 16-bit residue values, 8 per row, in row order.
  Each loop iteration reconstructs two rows; the advanced DST/SRC/RESIDUE
   pointers are stored back into the parameter slots at the end of the
   __asm block so the next iteration reloads them and resumes.
  NOTE: mm0 is zeroed once before the loop and used as the unpack source in
   every iteration; this relies on the compiler-generated loop code not
   touching MMX state between __asm blocks.*/
void oc_frag_recon_inter_mmx(unsigned char *_dst,const unsigned char *_src,
 int _ystride,const ogg_int16_t *_residue){
  int i;
  /*Zero mm0.*/
  __asm pxor mm0,mm0;
  for(i=4;i-->0;){
    __asm{
#define DST edx
#define SRC ecx
#define YSTRIDE edi
#define RESIDUE eax
      mov DST,_dst
      mov SRC,_src
      mov YSTRIDE,_ystride
      mov RESIDUE,_residue
      /*#0 Load source.*/
      movq mm3,[SRC]
      /*#1 Load source.*/
      movq mm7,[SRC+YSTRIDE]
      /*#0 Get copy of src.*/
      movq mm4,mm3
      /*#0 Expand high source.*/
      punpckhbw mm4,mm0
      /*#0 Expand low source.*/
      punpcklbw mm3,mm0
      /*#0 Add residue high.*/
      paddsw mm4,[8+RESIDUE]
      /*#1 Get copy of src.*/
      movq mm2,mm7
      /*#0 Add residue low.*/
      paddsw mm3,[RESIDUE]
      /*#1 Expand high source.*/
      punpckhbw mm2,mm0
      /*#0 Pack final row pixels.*/
      packuswb mm3,mm4
      /*#1 Expand low source.*/
      punpcklbw mm7,mm0
      /*#1 Add residue low.*/
      paddsw mm7,[16+RESIDUE]
      /*#1 Add residue high.*/
      paddsw mm2,[24+RESIDUE]
      /*Advance residue.*/
      lea RESIDUE,[32+RESIDUE]
      /*#1 Pack final row pixels.*/
      packuswb mm7,mm2
      /*Advance src.*/
      lea SRC,[SRC+YSTRIDE*2]
      /*#0 Write row.*/
      movq [DST],mm3
      /*#1 Write row.*/
      movq [DST+YSTRIDE],mm7
      /*Advance dst.*/
      lea DST,[DST+YSTRIDE*2]
      /*Spill the advanced pointers back to the parameter slots for the next
         loop iteration.*/
      mov _residue,RESIDUE
      mov _dst,DST
      mov _src,SRC
#undef DST
#undef SRC
#undef YSTRIDE
#undef RESIDUE
    }
  }
}
309
/*Reconstructs a bi-predicted 8x8 fragment: averages the two predictors
   ((src1+src2)>>1, computed on 16-bit words via paddsw/psraw), adds the
   16-bit residue, saturates to [0,255], and stores the result to _dst,
   _ystride bytes between rows.
  _dst:     The destination fragment.
  _src1:    The first predictor fragment.
  _src2:    The second predictor fragment.
  _ystride: The stride, in bytes, between rows of all fragments.
  _residue: 64 16-bit residue values, 8 per row, in row order.
  Each loop iteration reconstructs two rows; the advanced pointers are stored
   back into the parameter slots so the next iteration resumes.
  NOTE: mm7 is zeroed once before the loop and used as the unpack source in
   every iteration; this relies on the compiler-generated loop code not
   touching MMX state between __asm blocks.*/
void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
 const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue){
  int i;
  /*Zero mm7.*/
  __asm pxor mm7,mm7;
  for(i=4;i-->0;){
    __asm{
#define SRC1 ecx
#define SRC2 edi
#define YSTRIDE esi
#define RESIDUE edx
#define DST eax
      mov YSTRIDE,_ystride
      mov DST,_dst
      mov RESIDUE,_residue
      mov SRC1,_src1
      mov SRC2,_src2
      /*#0 Load src1.*/
      movq mm0,[SRC1]
      /*#0 Load src2.*/
      movq mm2,[SRC2]
      /*#0 Copy src1.*/
      movq mm1,mm0
      /*#0 Copy src2.*/
      movq mm3,mm2
      /*#1 Load src1.*/
      movq mm4,[SRC1+YSTRIDE]
      /*#0 Unpack lower src1.*/
      punpcklbw mm0,mm7
      /*#1 Load src2.*/
      movq mm5,[SRC2+YSTRIDE]
      /*#0 Unpack higher src1.*/
      punpckhbw mm1,mm7
      /*#0 Unpack lower src2.*/
      punpcklbw mm2,mm7
      /*#0 Unpack higher src2.*/
      punpckhbw mm3,mm7
      /*Advance src1 ptr.*/
      lea SRC1,[SRC1+YSTRIDE*2]
      /*Advance src2 ptr.*/
      lea SRC2,[SRC2+YSTRIDE*2]
      /*#0 Lower src1+src2.*/
      paddsw mm0,mm2
      /*#0 Higher src1+src2.*/
      paddsw mm1,mm3
      /*#1 Copy src1.*/
      movq mm2,mm4
      /*#0 Build lo average.*/
      psraw mm0,1
      /*#1 Copy src2.*/
      movq mm3,mm5
      /*#1 Unpack lower src1.*/
      punpcklbw mm4,mm7
      /*#0 Build hi average.*/
      psraw mm1,1
      /*#1 Unpack higher src1.*/
      punpckhbw mm2,mm7
      /*#0 low+=residue.*/
      paddsw mm0,[RESIDUE]
      /*#1 Unpack lower src2.*/
      punpcklbw mm5,mm7
      /*#0 high+=residue.*/
      paddsw mm1,[8+RESIDUE]
      /*#1 Unpack higher src2.*/
      punpckhbw mm3,mm7
      /*#1 Lower src1+src2.*/
      paddsw mm5,mm4
      /*#0 Pack and saturate.*/
      packuswb mm0,mm1
      /*#1 Higher src1+src2.*/
      paddsw mm3,mm2
      /*#0 Write row.*/
      movq [DST],mm0
      /*#1 Build lo average.*/
      psraw mm5,1
      /*#1 Build hi average.*/
      psraw mm3,1
      /*#1 low+=residue.*/
      paddsw mm5,[16+RESIDUE]
      /*#1 high+=residue.*/
      paddsw mm3,[24+RESIDUE]
      /*#1 Pack and saturate.*/
      packuswb mm5,mm3
      /*#1 Write row ptr.*/
      movq [DST+YSTRIDE],mm5
      /*Advance residue ptr.*/
      add RESIDUE,32
      /*Advance dest ptr.*/
      lea DST,[DST+YSTRIDE*2]
      /*Spill the advanced pointers back to the parameter slots for the next
         loop iteration.*/
      mov _dst,DST
      mov _residue,RESIDUE
      mov _src1,SRC1
      mov _src2,SRC2
#undef SRC1
#undef SRC2
#undef YSTRIDE
#undef RESIDUE
#undef DST
    }
  }
}
411
/*Restores the FPU state after MMX code has run: EMMS empties the MMX tag
   word so that subsequent x87 floating-point instructions work correctly.
  Must be called after the MMX routines above before any x87 code runs.*/
void oc_restore_fpu_mmx(void){
  __asm emms;
}
415
416 #endif
417