;/****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - SSE2 optimized SAD operators -
; *
; *  Copyright(C) 2003-2010 Pascal Massimino <skal@planet-d.net>
; *               2008-2010 Michael Militzer <michael@xvid.org>
; *
; *
; *  This program is free software; you can redistribute it and/or modify it
; *  under the terms of the GNU General Public License as published by
; *  the Free Software Foundation; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
; *
; * $Id: sad_sse2.asm,v 1.21 2010-11-28 15:18:21 Isibaar Exp $
; *
; ***************************************************************************/

%include "nasm.inc"

;=============================================================================
; Read only data
;=============================================================================

DATA

ALIGN SECTION_ALIGN
zero    times 4   dd 0

ALIGN SECTION_ALIGN
ones    times 8   dw 1

ALIGN SECTION_ALIGN
round32 times 4   dd 32

;=============================================================================
; Coeffs for MSE_H calculation
;=============================================================================

ALIGN SECTION_ALIGN
iMask_Coeff:
  dw     0, 29788, 32767, 20479, 13653, 8192, 6425, 5372
  dw 27306, 27306, 23405, 17246, 12603, 5650, 5461, 5958
  dw 23405, 25205, 20479, 13653,  8192, 5749, 4749, 5851
  dw 23405, 19275, 14894, 11299,  6425, 3766, 4096, 5285
  dw 18204, 14894,  8856,  5851,  4819, 3006, 3181, 4255
  dw 13653,  9362,  5958,  5120,  4045, 3151, 2900, 3562
  dw  6687,  5120,  4201,  3766,  3181, 2708, 2730, 3244
  dw  4551,  3562,  3449,  3344,  2926, 3277, 3181, 3310

ALIGN SECTION_ALIGN
Inv_iMask_Coeff:
  dd    0,   155,   128,   328,   737,  2048,  3329,  4763
  dd  184,   184,   251,   462,   865,  4306,  4608,  3872
  dd  251,   216,   328,   737,  2048,  4159,  6094,  4014
  dd  251,   370,   620,  1076,  3329,  9688,  8192,  4920
  dd  415,   620,  1752,  4014,  5919, 15207, 13579,  7589
  dd  737,  1568,  3872,  5243,  8398, 13844, 16345, 10834
  dd 3073,  5243,  7787,  9688, 13579, 18741, 18433, 13057
  dd 6636, 10834, 11552, 12294, 16056, 12800, 13579, 12545

ALIGN SECTION_ALIGN
iCSF_Coeff:
  dw 26353, 38331, 42164, 26353, 17568, 10541, 8268, 6912
  dw 35137, 35137, 30117, 22192, 16217,  7270, 7027, 7666
  dw 30117, 32434, 26353, 17568, 10541,  7397, 6111, 7529
  dw 30117, 24803, 19166, 14539,  8268,  4846, 5271, 6801
  dw 23425, 19166, 11396,  7529,  6201,  3868, 4094, 5476
  dw 17568, 12047,  7666,  6588,  5205,  4054, 3731, 4583
  dw  8605,  6588,  5406,  4846,  4094,  3485, 3514, 4175
  dw  5856,  4583,  4438,  4302,  3765,  4216, 4094, 4259

ALIGN SECTION_ALIGN
iCSF_Round:
  dw 1, 1, 1, 1, 2, 3, 4, 5
  dw 1, 1, 1, 1, 2, 5, 5, 4
  dw 1, 1, 1, 2, 3, 4, 5, 4
  dw 1, 1, 2, 2, 4, 7, 6, 5
  dw 1, 2, 3, 4, 5, 8, 8, 6
  dw 2, 3, 4, 5, 6, 8, 9, 7
  dw 4, 5, 6, 7, 8, 9, 9, 8
  dw 6, 7, 7, 8, 9, 8, 8, 8


;=============================================================================
; Code
;=============================================================================

TEXT

cglobal  sad16_sse2
cglobal  dev16_sse2

cglobal  sad16_sse3
cglobal  dev16_sse3

cglobal  sseh8_16bit_sse2
cglobal  coeff8_energy_sse2
cglobal  blocksum8_sse2

;-----------------------------------------------------------------------------
; uint32_t sad16_sse2 (const uint8_t * const cur, <- assumed aligned!
;                      const uint8_t * const ref,
;                      const uint32_t stride,
;                      const uint32_t /*ignored*/);
;-----------------------------------------------------------------------------
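;
; Reference only (not assembled): a rough C equivalent of what this routine
; computes, assuming 'cur' is 16-byte aligned as noted above.
;
;   uint32_t sad = 0;
;   for (int j = 0; j < 16; j++)
;     for (int i = 0; i < 16; i++)
;       sad += abs(cur[j*stride + i] - ref[j*stride + i]);
;   return sad;
;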


%macro SAD_16x16_SSE2 1  ; %1: load insn for the (possibly unaligned) ref: movdqu or lddqu; SADs two rows into xmm4, advances both pointers
  %1  xmm0, [TMP1]
  %1  xmm1, [TMP1+TMP0]
  lea TMP1,[TMP1+2*TMP0]
  movdqa  xmm2, [_EAX]
  movdqa  xmm3, [_EAX+TMP0]
  lea _EAX,[_EAX+2*TMP0]
  psadbw  xmm0, xmm2
  paddusw xmm4,xmm0
  psadbw  xmm1, xmm3
  paddusw xmm4,xmm1
%endmacro

%macro SAD16_SSE2_SSE3 1
  mov _EAX, prm1 ; cur (assumed aligned)
  mov TMP1, prm2 ; ref
  mov TMP0, prm3 ; stride

  pxor xmm4, xmm4 ; accum

  SAD_16x16_SSE2 %1
  SAD_16x16_SSE2 %1
  SAD_16x16_SSE2 %1
  SAD_16x16_SSE2 %1
  SAD_16x16_SSE2 %1
  SAD_16x16_SSE2 %1
  SAD_16x16_SSE2 %1
  SAD_16x16_SSE2 %1

  pshufd  xmm5, xmm4, 00000010b ; bring the high-qword partial SAD down
  paddusw xmm4, xmm5            ; combine both partial SADs
  pextrw  eax, xmm4, 0          ; the 16x16 SAD fits in 16 bits

  ret
%endmacro

ALIGN SECTION_ALIGN
sad16_sse2:
  SAD16_SSE2_SSE3 movdqu
ENDFUNC


ALIGN SECTION_ALIGN
sad16_sse3:
  SAD16_SSE2_SSE3 lddqu
ENDFUNC


;-----------------------------------------------------------------------------
; uint32_t dev16_sse2(const uint8_t * const cur, const uint32_t stride);
;-----------------------------------------------------------------------------
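;
; Reference only (not assembled): a rough C sketch of the deviation measure
; computed below; the mean is the truncated average of the 16x16 block.
;
;   uint32_t mean = 0, dev = 0;
;   for (int j = 0; j < 16; j++)
;     for (int i = 0; i < 16; i++)
;       mean += cur[j*stride + i];
;   mean /= 256;
;   for (int j = 0; j < 16; j++)
;     for (int i = 0; i < 16; i++)
;       dev += abs(cur[j*stride + i] - (int)mean);
;   return dev;
;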

%macro MEAN_16x16_SSE2 1  ; _EAX: src, TMP0: stride, xmm5: zero or mean, xmm4: accumulator
  %1 xmm0, [_EAX]
  %1 xmm1, [_EAX+TMP0]
  lea _EAX, [_EAX+2*TMP0]    ; + 2*stride
  psadbw xmm0, xmm5
  paddusw xmm4, xmm0
  psadbw xmm1, xmm5
  paddusw xmm4, xmm1
%endmacro


%macro MEAN16_SSE2_SSE3 1
  mov _EAX, prm1   ; src
  mov TMP0, prm2   ; stride

  pxor xmm4, xmm4     ; accum
  pxor xmm5, xmm5     ; zero

  MEAN_16x16_SSE2 %1
  MEAN_16x16_SSE2 %1
  MEAN_16x16_SSE2 %1
  MEAN_16x16_SSE2 %1

  MEAN_16x16_SSE2 %1
  MEAN_16x16_SSE2 %1
  MEAN_16x16_SSE2 %1
  MEAN_16x16_SSE2 %1

  mov _EAX, prm1       ; src again

  pshufd   xmm5, xmm4, 10b
  paddusw  xmm5, xmm4
  pxor     xmm4, xmm4     ; zero accum
  psrlw    xmm5, 8        ; => Mean
  pshuflw  xmm5, xmm5, 0  ; replicate Mean
  packuswb xmm5, xmm5
  pshufd   xmm5, xmm5, 00000000b

  MEAN_16x16_SSE2 %1
  MEAN_16x16_SSE2 %1
  MEAN_16x16_SSE2 %1
  MEAN_16x16_SSE2 %1

  MEAN_16x16_SSE2 %1
  MEAN_16x16_SSE2 %1
  MEAN_16x16_SSE2 %1
  MEAN_16x16_SSE2 %1

  pshufd   xmm5, xmm4, 10b
  paddusw  xmm5, xmm4
  pextrw eax, xmm5, 0

  ret
%endmacro

ALIGN SECTION_ALIGN
dev16_sse2:
  MEAN16_SSE2_SSE3 movdqu
ENDFUNC

ALIGN SECTION_ALIGN
dev16_sse3:
  MEAN16_SSE2_SSE3 lddqu
ENDFUNC

;-----------------------------------------------------------------------------
; uint32_t coeff8_energy_sse2(const int16_t * dct);
;-----------------------------------------------------------------------------
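;
; Reference only (not assembled): a rough C sketch of the energy measure
; computed below, ignoring 16-bit wrap/truncation in the psllw/pmulhw steps.
;
;   int16_t v[64];
;   for (int i = 0; i < 64; i++)
;     v[i] = (int16_t)(((dct[i] << 4) * iMask_Coeff[i]) >> 16);   // pmulhw
;   uint32_t energy = 0;
;   for (int r = 0; r < 8; r += 2)            // two rows at a time
;     for (int k = 0; k < 4; k++) {           // one packed dword per column pair
;       int a = 8*r + 2*k, b = 8*(r+1) + 2*k;
;       energy += (uint32_t)(v[a]*v[a] + v[a+1]*v[a+1] + v[b]*v[b] + v[b+1]*v[b+1]) >> 3;
;     }
;   return energy;
;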

%macro DCT_ENERGY_SSE2 4  ; %1, %2: scratch xmm regs, %3: dct base pointer, %4: byte offset (two rows); result dwords left in %1

  movdqa  %1, [%3 + %4]
  movdqa  %2, [%3 + %4 + 16]

  psllw %1, 4
  psllw %2, 4

  pmulhw  %1, [iMask_Coeff + %4]
  pmulhw  %2, [iMask_Coeff + %4 + 16]

  pmaddwd %1, %1
  pmaddwd %2, %2

  paddd   %1, %2
  psrld   %1, 3

%endmacro

ALIGN SECTION_ALIGN
coeff8_energy_sse2:

  mov TMP0, prm1  ; DCT_A

  DCT_ENERGY_SSE2 xmm0, xmm1, TMP0,  0
  DCT_ENERGY_SSE2 xmm1, xmm2, TMP0, 32

  DCT_ENERGY_SSE2 xmm2, xmm3, TMP0, 64
  DCT_ENERGY_SSE2 xmm3, xmm4, TMP0, 96

  paddd xmm0, xmm1
  paddd xmm2, xmm3

  paddd xmm0, xmm2 ; A B C D

  ; horizontal sum of the four dwords
  pshufd xmm1, xmm0, 238
  paddd xmm0, xmm1

  pshufd xmm2, xmm0, 85
  paddd xmm0, xmm2

  movd eax, xmm0

  ret
ENDFUNC

;-----------------------------------------------------------------------------------
; uint32_t sseh8_16bit_sse2(const int16_t * cur, const int16_t * ref, uint16_t mask)
;-----------------------------------------------------------------------------------
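;
; Reference only (not assembled): per coefficient i, the macro below computes
; roughly the following (saturating/wrapping details of the packed ops omitted):
;
;   thresh = (mask * Inv_iMask_Coeff[i] + 32) >> 7;
;   d      = MAX(16 * abs(cur[i] - ref[i]) - thresh, 0);     // psubusw
;   w      = ((d + iCSF_Round[i]) * iCSF_Coeff[i]) >> 16;    // pmulhuw
;   sum   += w * w;                                          // then summed horizontally
;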

%macro SSEH_SSE2 4  ; %1: cur base, %2: ref base, %3: byte offset, %4: xmm reg with replicated mask (must be xmm7, used directly); result accumulated in xmm0
  movdqa xmm0, [%1 + %3]
  movdqa xmm1, [%2 + %3]

  movdqa xmm2, [%1 + %3 + 16]
  movdqa xmm3, [%2 + %3 + 16]


  movdqa xmm4, xmm7 ; MASK
  movdqa xmm5, xmm7

  psubsw xmm0, xmm1 ; A - B
  psubsw xmm2, xmm3


  ; ABS
  pxor xmm1, xmm1
  pxor xmm3, xmm3

  pcmpgtw xmm1, xmm0
  pcmpgtw xmm3, xmm2

  pxor xmm0, xmm1     ; change sign if negative
  pxor xmm2, xmm3     ;

  psubw xmm0, xmm1    ; ABS (A - B)
  psubw xmm2, xmm3    ; ABS (A - B)


  movdqa xmm1, xmm7 ; MASK
  movdqa xmm3, xmm7

  pmaddwd xmm4, [Inv_iMask_Coeff + 2*(%3)]
  pmaddwd xmm5, [Inv_iMask_Coeff + 2*(%3) + 16]

  pmaddwd xmm1, [Inv_iMask_Coeff + 2*(%3) + 32]
  pmaddwd xmm3, [Inv_iMask_Coeff + 2*(%3) + 48]

  psllw xmm0, 4
  psllw xmm2, 4

  paddd xmm4, [round32]
  paddd xmm5, [round32]

  paddd xmm1, [round32]
  paddd xmm3, [round32]

  psrad xmm4, 7
  psrad xmm5, 7

  psrad xmm1, 7
  psrad xmm3, 7

  packssdw xmm4, xmm5 ; Thresh
  packssdw xmm1, xmm3 ; Thresh


  psubusw xmm0, xmm4 ; Decimate by masking effect
  psubusw xmm2, xmm1

  paddusw xmm0, [iCSF_Round + %3]
  paddusw xmm2, [iCSF_Round + %3 + 16]

  pmulhuw xmm0, [iCSF_Coeff + %3]
  pmulhuw xmm2, [iCSF_Coeff + %3 + 16]

  pmaddwd xmm0, xmm0
  pmaddwd xmm2, xmm2

  paddd xmm0, xmm2
%endmacro


ALIGN SECTION_ALIGN
sseh8_16bit_sse2:

  PUSH_XMM6_XMM7

  mov TMP0, prm1  ; DCT_A
  mov TMP1, prm2  ; DCT_B
  mov _EAX, prm3  ; MASK

  movd xmm7, eax
  pshufd xmm7, xmm7, 0

  SSEH_SSE2 TMP0, TMP1,   0, xmm7
  movdqa xmm6, xmm0
  SSEH_SSE2 TMP0, TMP1,  32, xmm7
  paddd xmm6, xmm0
  SSEH_SSE2 TMP0, TMP1,  64, xmm7
  paddd xmm6, xmm0
  SSEH_SSE2 TMP0, TMP1,  96, xmm7
  paddd xmm6, xmm0

  ; horizontal sum of the four dwords
  pshufd xmm1, xmm6, 238
  paddd xmm6, xmm1

  pshufd xmm2, xmm6, 85
  paddd xmm6, xmm2


  movd eax, xmm6

  POP_XMM6_XMM7
  ret
ENDFUNC

;--------------------------------------------------------------------------------------------
; uint32_t blocksum8_sse2(const int8_t * cur, int stride, uint16_t sums[4], uint32_t squares[4])
;--------------------------------------------------------------------------------------------
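;
; Reference only (not assembled): a rough C sketch; pixels are treated as
; unsigned bytes. The 8x8 block is split into its four 4x4 sub-blocks
; (top-left, top-right, bottom-left, bottom-right, in that order); their pixel
; sums and sums of squares are stored and the total sum of the block returned.
;
;   uint32_t total = 0;
;   for (int b = 0; b < 4; b++) {
;     const uint8_t *p = (const uint8_t *)cur + (b / 2) * 4 * stride + (b % 2) * 4;
;     uint32_t s = 0, sq = 0;
;     for (int j = 0; j < 4; j++)
;       for (int i = 0; i < 4; i++) {
;         s  += p[j*stride + i];
;         sq += p[j*stride + i] * p[j*stride + i];
;       }
;     sums[b] = (uint16_t)s; squares[b] = sq; total += s;
;   }
;   return total;
;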

%macro BLOCKSUM_SSE2 3  ; %1: cur pointer (advanced), %2: stride, %3: 3*stride; needs xmm7 = 0; out: xmm0 = 4x4 sub-block sums (words), xmm2 = 4x4 sums of squares (dwords)
  movq xmm0, [%1       ] ; 0 0 B A
  movq xmm2, [%1 +   %2] ; 0 0 B A
  movq xmm1, [%1 + 2*%2]
  movq xmm3, [%1 +   %3]

  punpckldq xmm0, xmm2 ; B B A A
  punpckldq xmm1, xmm3 ; B B A A

  movdqa xmm2, xmm0
  movdqa xmm3, xmm1

  psadbw xmm0, xmm7 ; 000b000a
  psadbw xmm1, xmm7

  movdqa xmm4, xmm2
  movdqa xmm5, xmm3

  punpcklbw xmm2, xmm7 ; aaaaaaaa
  punpcklbw xmm3, xmm7

  punpckhbw xmm4, xmm7 ; bbbbbbbb
  punpckhbw xmm5, xmm7

  pmaddwd xmm2, xmm2 ; a*a+a*a a*a+a*a a*a+a*a a*a+a*a
  pmaddwd xmm3, xmm3

  pmaddwd xmm4, xmm4 ; b*b+b*b b*b+b*b b*b+b*b b*b+b*b
  pmaddwd xmm5, xmm5

  paddd xmm2, xmm3
  paddd xmm4, xmm5

  movdqa xmm3, xmm2
  punpckldq xmm2, xmm4 ; BABA
  punpckhdq xmm3, xmm4 ; BABA

  paddd xmm2, xmm3

  lea %1, [%1 + 4*%2]

  movdqa xmm4, xmm2
  punpckhqdq xmm4, xmm7 ;

  paddd xmm2, xmm4

  ; bottom half: the C and D 4x4 sub-blocks
  movq xmm3, [%1       ] ; 0 0 D C
  movq xmm5, [%1 +   %2] ; 0 0 D C
  movq xmm4, [%1 + 2*%2]
  movq xmm6, [%1 +   %3]

  punpckldq xmm3, xmm5 ; D D C C
  punpckldq xmm4, xmm6 ; D D C C

  movdqa xmm5, xmm3
  movdqa xmm6, xmm4

  psadbw xmm3, xmm7 ; 000d000c
  psadbw xmm4, xmm7

  packssdw xmm0, xmm3 ; 0d0c0b0a
  packssdw xmm1, xmm4 ;

  paddusw  xmm0, xmm1
  packssdw xmm0, xmm7 ; 0000dcba


  movdqa xmm3, xmm5
  movdqa xmm4, xmm6

  punpcklbw xmm3, xmm7
  punpcklbw xmm4, xmm7

  punpckhbw xmm5, xmm7
  punpckhbw xmm6, xmm7

  pmaddwd xmm3, xmm3 ; C*C+C*C
  pmaddwd xmm4, xmm4

  pmaddwd xmm5, xmm5 ; D*D+D*D
  pmaddwd xmm6, xmm6

  paddd xmm3, xmm4
  paddd xmm5, xmm6

  movdqa xmm1, xmm3
  punpckldq xmm3, xmm5 ; DCDC
  punpckhdq xmm1, xmm5 ; DCDC

  paddd xmm3, xmm1

  movdqa xmm4, xmm3
  punpckhqdq xmm4, xmm7 ;

  paddd xmm3, xmm4
  punpcklqdq xmm2, xmm3
%endmacro


ALIGN SECTION_ALIGN
blocksum8_sse2:

  PUSH_XMM6_XMM7

  mov TMP0, prm1  ; cur
  mov TMP1, prm2  ; stride
  mov _EAX, prm3  ; sums

  push _EBP
  lea _EBP, [TMP1 + 2*TMP1]  ; 3*stride

  pxor xmm7, xmm7

  BLOCKSUM_SSE2 TMP0, TMP1, _EBP

  pop _EBP
  mov TMP0, prm4  ; squares

  movq [_EAX], xmm0   ; sums of the 4x4 sub-blocks
  movdqa [TMP0], xmm2 ; squares of the 4x4 sub-blocks

  pmaddwd xmm0, [ones]
  packssdw xmm0, xmm7

  pmaddwd xmm0, [ones]
  movd eax, xmm0      ; return the total sum of the 8x8 block

  POP_XMM6_XMM7
  ret
ENDFUNC

NON_EXEC_STACK