;/****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - 8<->16 bit transfer functions -
; *
; *  Copyright (C) 2002 Jaan Kalda
; *
; *  This program is free software ; you can redistribute it and/or modify
; *  it under the terms of the GNU General Public License as published by
; *  the Free Software Foundation ; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY ; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program ; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
; *
; * $Id: mem_transfer_3dne.asm,v 1.13 2009-09-16 17:07:58 Isibaar Exp $
; *
; ***************************************************************************/

; these 3dne functions are compatible with iSSE, but are optimized specifically
; for K7 pipelines

%include "nasm.inc"

;=============================================================================
; Read only data
;=============================================================================

DATA

ALIGN SECTION_ALIGN
mm_zero:
	dd 0,0
;=============================================================================
; Macros
;=============================================================================

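; nop4 emits a 4-byte filler on 32-bit builds: the bytes 08Dh, 074h, 026h, 0
; encode "lea esi, [esi+0]", which changes no state and is presumably there to
; keep the following instructions aligned/scheduled for the K7 decoders.
; On x86-64 builds it expands to nothing.
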
%ifdef ARCH_IS_X86_64
%define nop4
%else
%macro nop4 0
	db 08Dh, 074h, 026h, 0
%endmacro
%endif

;=============================================================================
; Code
;=============================================================================

TEXT

cglobal transfer_8to16copy_3dne
cglobal transfer_16to8copy_3dne
cglobal transfer_8to16sub_3dne
cglobal transfer_8to16subro_3dne
cglobal transfer_8to16sub2_3dne
cglobal transfer_16to8add_3dne
cglobal transfer8x8_copy_3dne
cglobal transfer8x4_copy_3dne

;-----------------------------------------------------------------------------
;
; void transfer_8to16copy_3dne(int16_t * const dst,
;							const uint8_t * const src,
;							uint32_t stride);
;
;-----------------------------------------------------------------------------
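; Rough scalar sketch (a hypothetical reference helper, not part of the build)
; of what the MMX code below computes: copy an 8x8 block of bytes into a
; contiguous 8x8 array of 16-bit words.
;
;   void transfer_8to16copy_ref(int16_t *dst, const uint8_t *src, uint32_t stride)
;   {
;     for (int i = 0; i < 8; i++)
;       for (int j = 0; j < 8; j++)
;         dst[i*8 + j] = (int16_t) src[i*stride + j];
;   }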

ALIGN SECTION_ALIGN
transfer_8to16copy_3dne:

  mov _EAX, prm2 ; Src
  mov TMP1, prm3 ; Stride
  mov TMP0, prm1 ; Dst
  punpcklbw mm0, [byte _EAX]
  punpcklbw mm1, [_EAX+4]
  movq mm2, [_EAX+TMP1]
  movq mm3, [_EAX+TMP1]
  pxor mm7, mm7
  lea _EAX, [_EAX+2*TMP1]
  punpcklbw mm2, mm7
  punpckhbw mm3, mm7
  psrlw mm0, 8
  psrlw mm1, 8
  punpcklbw mm4, [_EAX]
  punpcklbw mm5, [_EAX+TMP1+4]
  movq [byte TMP0+0*64], mm0
  movq [TMP0+0*64+8], mm1
  punpcklbw mm6, [_EAX+TMP1]
  punpcklbw mm7, [_EAX+4]
  lea _EAX, [byte _EAX+2*TMP1]
  psrlw mm4, 8
  psrlw mm5, 8
  punpcklbw mm0, [_EAX]
  punpcklbw mm1, [_EAX+TMP1+4]
  movq [TMP0+0*64+16], mm2
  movq [TMP0+0*64+24], mm3
  psrlw mm6, 8
  psrlw mm7, 8
  punpcklbw mm2, [_EAX+TMP1]
  punpcklbw mm3, [_EAX+4]
  lea _EAX, [byte _EAX+2*TMP1]
  movq [byte TMP0+0*64+32], mm4
  movq [TMP0+0*64+56], mm5
  psrlw mm0, 8
  psrlw mm1, 8
  punpcklbw mm4, [_EAX]
  punpcklbw mm5, [_EAX+TMP1+4]
  movq [byte TMP0+0*64+48], mm6
  movq [TMP0+0*64+40], mm7
  psrlw mm2, 8
  psrlw mm3, 8
  punpcklbw mm6, [_EAX+TMP1]
  punpcklbw mm7, [_EAX+4]
  movq [byte TMP0+1*64], mm0
  movq [TMP0+1*64+24], mm1
  psrlw mm4, 8
  psrlw mm5, 8
  movq [TMP0+1*64+16], mm2
  movq [TMP0+1*64+8], mm3
  psrlw mm6, 8
  psrlw mm7, 8
  movq [byte TMP0+1*64+32], mm4
  movq [TMP0+1*64+56], mm5
  movq [byte TMP0+1*64+48], mm6
  movq [TMP0+1*64+40], mm7
  ret
ENDFUNC


;-----------------------------------------------------------------------------
;
; void transfer_16to8copy_3dne(uint8_t * const dst,
;							const int16_t * const src,
;							uint32_t stride);
;
;-----------------------------------------------------------------------------
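; Rough scalar sketch (hypothetical helper, not part of the build): copy a
; contiguous 8x8 block of 16-bit words back to bytes, saturating each value to
; the 0..255 range as packuswb does below.
;
;   void transfer_16to8copy_ref(uint8_t *dst, const int16_t *src, uint32_t stride)
;   {
;     for (int i = 0; i < 8; i++)
;       for (int j = 0; j < 8; j++) {
;         int v = src[i*8 + j];
;         dst[i*stride + j] = (uint8_t) (v < 0 ? 0 : (v > 255 ? 255 : v));
;       }
;   }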

ALIGN SECTION_ALIGN
transfer_16to8copy_3dne:

  mov _EAX, prm2 ; Src
  mov TMP0, prm1 ; Dst
  mov TMP1, prm3 ; Stride

  movq mm0, [byte _EAX+0*32]
  packuswb mm0, [_EAX+0*32+8]
  movq mm1, [_EAX+0*32+16]
  packuswb mm1, [_EAX+0*32+24]
  movq mm5, [_EAX+2*32+16]
  movq mm2, [_EAX+1*32]
  packuswb mm2, [_EAX+1*32+8]
  movq mm3, [_EAX+1*32+16]
  packuswb mm3, [_EAX+1*32+24]
  movq mm6, [_EAX+3*32]
  movq mm4, [_EAX+2*32]
  packuswb mm4, [_EAX+2*32+8]
  packuswb mm5, [_EAX+2*32+24]
  movq mm7, [_EAX+3*32+16]
  packuswb mm7, [_EAX+3*32+24]
  packuswb mm6, [_EAX+3*32+8]
  movq [TMP0], mm0
  lea _EAX, [3*TMP1]
  add _EAX, TMP0
  movq [TMP0+TMP1], mm1
  movq [TMP0+2*TMP1], mm2
  movq [byte _EAX], mm3
  movq [TMP0+4*TMP1], mm4
  lea TMP0, [byte TMP0+4*TMP1]
  movq [_EAX+2*TMP1], mm5
  movq [_EAX+4*TMP1], mm7
  movq [TMP0+2*TMP1], mm6
  ret
ENDFUNC

;-----------------------------------------------------------------------------
;
; void transfer_8to16sub_3dne(int16_t * const dct,
;				uint8_t * const cur,
;				const uint8_t * const ref,
;				const uint32_t stride);
;
;-----------------------------------------------------------------------------
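; Rough scalar sketch (hypothetical helper, not part of the build) covering both
; entry points built from COPY_8_TO_16_SUB below: write the 8x8 residual
; cur - ref as 16-bit words; transfer_8to16sub_3dne additionally overwrites cur
; with ref, while transfer_8to16subro_3dne leaves cur untouched.
;
;   void transfer_8to16sub_ref(int16_t *dct, uint8_t *cur, const uint8_t *ref,
;                              uint32_t stride, int writeback)
;   {
;     for (int i = 0; i < 8; i++)
;       for (int j = 0; j < 8; j++) {
;         dct[i*8 + j] = (int16_t) cur[i*stride + j] - (int16_t) ref[i*stride + j];
;         if (writeback)            /* the "sub" variant (macro argument %2 == 1) */
;           cur[i*stride + j] = ref[i*stride + j];
;       }
;   }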

; when the second macro argument is 1, the reference block (TMP0) is copied
; into the current block (_EAX)
%macro COPY_8_TO_16_SUB 2
  movq mm1, [_EAX]      ; cur
  movq mm0, mm1
  movq mm4, [TMP0]      ; ref
  movq mm6, mm4
%if %2 == 1
  movq [_EAX], mm4
%endif
  punpckhbw mm1, mm7
  punpckhbw mm6, mm7
  punpcklbw mm4, mm7
ALIGN SECTION_ALIGN
  movq mm2, [byte _EAX+TMP1]
  punpcklbw mm0, mm7
  movq mm3, [byte _EAX+TMP1]
  punpcklbw mm2, mm7
  movq mm5, [byte TMP0+TMP1]  ; ref
  punpckhbw mm3, mm7
%if %2 == 1
  movq [byte _EAX+TMP1], mm5
%endif
  psubsw mm1, mm6

  movq mm6, mm5
  psubsw mm0, mm4
%if (%1 < 3)
  lea _EAX,[_EAX+2*TMP1]
  lea TMP0,[TMP0+2*TMP1]
%else
  mov TMP0,[_ESP]
  add _ESP,byte PTR_SIZE
%endif
  movq [_EDI+%1*32+ 8], mm1
  movq [byte _EDI+%1*32+ 0], mm0 ; dst
  punpcklbw mm5, mm7
  punpckhbw mm6, mm7
  psubsw mm2, mm5
  psubsw mm3, mm6
  movq [_EDI+%1*32+16], mm2
  movq [_EDI+%1*32+24], mm3
%endmacro

ALIGN SECTION_ALIGN
transfer_8to16sub_3dne:
  mov _EAX, prm2 ; Cur
  mov TMP0, prm3 ; Ref
  mov TMP1, prm4 ; Stride

  push _EDI
%ifdef ARCH_IS_X86_64
  mov _EDI, prm1
%else
  mov _EDI, [_ESP+4+4] ; Dst
%endif

  pxor mm7, mm7
  nop
ALIGN SECTION_ALIGN
  COPY_8_TO_16_SUB 0, 1
  COPY_8_TO_16_SUB 1, 1
  COPY_8_TO_16_SUB 2, 1
  COPY_8_TO_16_SUB 3, 1
  mov _EDI, TMP0
  ret
ENDFUNC

ALIGN SECTION_ALIGN
transfer_8to16subro_3dne:
  mov _EAX, prm2 ; Cur
  mov TMP0, prm3 ; Ref
  mov TMP1, prm4 ; Stride

  push _EDI
%ifdef ARCH_IS_X86_64
  mov _EDI, prm1
%else
  mov _EDI, [_ESP+4+ 4] ; Dst
%endif

  pxor mm7, mm7
  nop
ALIGN SECTION_ALIGN
  COPY_8_TO_16_SUB 0, 0
  COPY_8_TO_16_SUB 1, 0
  COPY_8_TO_16_SUB 2, 0
  COPY_8_TO_16_SUB 3, 0
  mov _EDI, TMP0
  ret
ENDFUNC


;-----------------------------------------------------------------------------
;
; void transfer_8to16sub2_3dne(int16_t * const dct,
;				uint8_t * const cur,
;				const uint8_t * ref1,
;				const uint8_t * ref2,
;				const uint32_t stride)
;
;-----------------------------------------------------------------------------
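; Rough scalar sketch (hypothetical helper, not part of the build): average the
; two reference blocks with rounding (pavgb computes (a + b + 1) >> 1), store
; that average back into cur, and write the 8x8 residual as 16-bit words.
;
;   void transfer_8to16sub2_ref(int16_t *dct, uint8_t *cur, const uint8_t *ref1,
;                               const uint8_t *ref2, uint32_t stride)
;   {
;     for (int i = 0; i < 8; i++)
;       for (int j = 0; j < 8; j++) {
;         uint8_t r = (uint8_t) ((ref1[i*stride + j] + ref2[i*stride + j] + 1) >> 1);
;         dct[i*8 + j] = (int16_t) cur[i*stride + j] - (int16_t) r;
;         cur[i*stride + j] = r;
;       }
;   }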

%macro COPY_8_TO_16_SUB2_SSE 1
  db 0Fh, 6Fh, 44h, 20h, 00  ;movq mm0, [byte _EAX]      ; cur
  punpcklbw mm0, mm7
  movq mm2, [byte _EAX+TMP1]
  punpcklbw mm2, mm7
  db 0Fh, 6Fh, 4ch, 20h, 00 ;movq mm1, [byte _EAX]
  punpckhbw mm1, mm7
  movq mm3, [byte _EAX+TMP1]
  punpckhbw mm3, mm7

  movq mm4, [byte _EBX]      ; ref1
  pavgb mm4, [byte _ESI]     ; ref2
  movq [_EAX], mm4
  movq mm5, [_EBX+TMP1]  ; ref
  pavgb mm5, [_ESI+TMP1] ; ref2
  movq [_EAX+TMP1], mm5
  movq mm6, mm4
  punpcklbw mm4, mm7
  punpckhbw mm6, mm7
%if (%1 < 3)
  lea _ESI,[_ESI+2*TMP1]
  lea _EBX,[byte _EBX+2*TMP1]
  lea _EAX,[_EAX+2*TMP1]
%else
  mov _ESI,[_ESP]
  mov _EBX,[_ESP+PTR_SIZE]
  add _ESP,byte 2*PTR_SIZE
%endif
  psubsw mm0, mm4
  psubsw mm1, mm6
  movq mm6, mm5
  punpcklbw mm5, mm7
  punpckhbw mm6, mm7
  psubsw mm2, mm5
  psubsw mm3, mm6
  movq [byte TMP0+%1*32+ 0], mm0 ; dst
  movq [TMP0+%1*32+ 8], mm1
  movq [TMP0+%1*32+16], mm2
  movq [TMP0+%1*32+24], mm3
%endmacro

ALIGN SECTION_ALIGN
transfer_8to16sub2_3dne:
  mov TMP1d, prm5d ; Stride
  mov TMP0, prm1   ; Dst
  mov _EAX, prm2   ; Cur
  push _EBX
  lea _EBP,[byte _EBP]

%ifdef ARCH_IS_X86_64
  mov _EBX, prm3
%else
  mov _EBX, [_ESP+4+12] ; Ref1
%endif

  push _ESI
  pxor mm7, mm7

%ifdef ARCH_IS_X86_64
  mov _ESI, prm4
%else
  mov _ESI, [_ESP+8+16] ; Ref2
%endif

  nop4
  COPY_8_TO_16_SUB2_SSE 0
  COPY_8_TO_16_SUB2_SSE 1
  COPY_8_TO_16_SUB2_SSE 2
  COPY_8_TO_16_SUB2_SSE 3

  ret
ENDFUNC


;-----------------------------------------------------------------------------
;
; void transfer_16to8add_3dne(uint8_t * const dst,
;						const int16_t * const src,
;						uint32_t stride);
;
;-----------------------------------------------------------------------------
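; Rough scalar sketch (hypothetical helper, not part of the build): add an 8x8
; block of 16-bit residuals to the destination bytes, clamping the result to
; 0..255 (paddsw followed by packuswb below).
;
;   void transfer_16to8add_ref(uint8_t *dst, const int16_t *src, uint32_t stride)
;   {
;     for (int i = 0; i < 8; i++)
;       for (int j = 0; j < 8; j++) {
;         int v = dst[i*stride + j] + src[i*8 + j];
;         dst[i*stride + j] = (uint8_t) (v < 0 ? 0 : (v > 255 ? 255 : v));
;       }
;   }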

%macro COPY_16_TO_8_ADD 1
  movq mm0, [byte TMP0]
  punpcklbw mm0, mm7
  movq mm2, [byte TMP0+TMP1]
  punpcklbw mm2, mm7
  movq mm1, [byte TMP0]
  punpckhbw mm1, mm7
  movq mm3, [byte TMP0+TMP1]
  punpckhbw mm3, mm7
  paddsw mm0, [byte _EAX+%1*32+ 0]
  paddsw mm1, [_EAX+%1*32+ 8]
  paddsw mm2, [_EAX+%1*32+16]
  paddsw mm3, [_EAX+%1*32+24]
  packuswb mm0, mm1
  packuswb mm2, mm3
  mov _ESP, _ESP
  movq [byte TMP0], mm0
  movq [TMP0+TMP1], mm2
%endmacro


ALIGN SECTION_ALIGN
transfer_16to8add_3dne:
  mov TMP0, prm1 ; Dst
  mov TMP1, prm3 ; Stride
  mov _EAX, prm2 ; Src
  pxor mm7, mm7
  nop

  COPY_16_TO_8_ADD 0
  lea TMP0,[byte TMP0+2*TMP1]
  COPY_16_TO_8_ADD 1
  lea TMP0,[byte TMP0+2*TMP1]
  COPY_16_TO_8_ADD 2
  lea TMP0,[byte TMP0+2*TMP1]
  COPY_16_TO_8_ADD 3
  ret
ENDFUNC

;-----------------------------------------------------------------------------
;
; void transfer8x8_copy_3dne(uint8_t * const dst,
;					const uint8_t * const src,
;					const uint32_t stride);
;
;
;-----------------------------------------------------------------------------
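; Rough scalar sketch (hypothetical helper, not part of the build): plain 8x8
; byte copy, source and destination sharing the same stride.
;
;   void transfer8x8_copy_ref(uint8_t *dst, const uint8_t *src, uint32_t stride)
;   {
;     for (int i = 0; i < 8; i++)
;       for (int j = 0; j < 8; j++)
;         dst[i*stride + j] = src[i*stride + j];
;   }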

%macro COPY_8_TO_8 0
  movq mm0, [byte  _EAX]
  movq mm1, [_EAX+TMP1]
  movq [byte TMP0], mm0
  lea _EAX,[byte _EAX+2*TMP1]
  movq [TMP0+TMP1], mm1
%endmacro

ALIGN SECTION_ALIGN
transfer8x8_copy_3dne:
  mov _EAX, prm2 ; Src
  mov TMP1, prm3 ; Stride
  mov TMP0, prm1 ; Dst

  COPY_8_TO_8
  lea TMP0,[byte TMP0+2*TMP1]
  COPY_8_TO_8
  lea TMP0,[byte TMP0+2*TMP1]
  COPY_8_TO_8
  lea TMP0,[byte TMP0+2*TMP1]
  COPY_8_TO_8
  ret
ENDFUNC

;-----------------------------------------------------------------------------
;
; void transfer8x4_copy_3dne(uint8_t * const dst,
;					const uint8_t * const src,
;					const uint32_t stride);
;
;
;-----------------------------------------------------------------------------
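; Same as transfer8x8_copy_3dne above, but only the top four rows are copied.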

ALIGN SECTION_ALIGN
transfer8x4_copy_3dne:
  mov _EAX, prm2 ; Src
  mov TMP1, prm3 ; Stride
  mov TMP0, prm1 ; Dst

  COPY_8_TO_8
  lea TMP0,[byte TMP0+2*TMP1]
  COPY_8_TO_8
  ret
ENDFUNC

NON_EXEC_STACK
