1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
14;void vp8_idct_dequant_0_2x_sse2
15; (
16;   short *qcoeff       - 0
17;   short *dequant      - 1
18;   unsigned char *dst  - 2
19;   int dst_stride      - 3
20; )
21
22SECTION .text
23
global sym(vp8_idct_dequant_0_2x_sse2) PRIVATE
sym(vp8_idct_dequant_0_2x_sse2):
    ; DC-only reconstruction for two 4x4 blocks that sit side by side in dst.
    ; For each block the first coefficient (qcoeff[0] and qcoeff[16]) is
    ; multiplied by dequant[0], rounded as (x + 4) >> 3, and the result is
    ; added to that block's half of four 8-byte rows of dst, with unsigned
    ; saturation on the way back to bytes.
    ; Uses rax, rcx, rdx and xmm0-xmm5 as scratch.
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    GET_GOT     rbx
    ; end prolog

        mov         rax,            arg(0) ; qcoeff
        mov         rdx,            arg(1) ; dequant

    ; xmm4: word 0 <- qcoeff[0], word 4 <- qcoeff[16]
    ; (movd also pulls in qcoeff[1]; that lane is discarded by the
    ;  broadcast below)
        movd        xmm4,           [rax]
        pinsrw      xmm4,           [rax+32],   4

    ; xmm5: word 0 and word 4 <- dequant[0]
        movd        xmm5,           [rdx]
        pinsrw      xmm5,           [rdx],      4

    ; dequantize both DC values at once
        pmullw      xmm4,           xmm5

    ; xmm5 becomes the zero register used for byte<->word conversion
        pxor        xmm5,           xmm5

    ; wipe the consumed coefficients (4 bytes at the start of each block)
        movd        [rax],          xmm5
        movd        [rax+32],       xmm5

        mov         rax,            arg(2) ; dst
        movsxd      rdx,            dword ptr arg(3) ; dst_stride
        lea         rcx,            [rdx + rdx*2]    ; dst_stride * 3

    ; broadcast: low four words = block 0 DC, high four words = block 1 DC
        pshuflw     xmm4,           xmm4,       00000000b
        pshufhw     xmm4,           xmm4,       00000000b

    ; round and scale: (dc + 4) >> 3
        paddw       xmm4,           [GLOBAL(fours)]
        psraw       xmm4,           3

    ; fetch all four prediction rows before anything is written back
        movq        xmm0,           [rax]
        movq        xmm1,           [rax+rdx]
        movq        xmm2,           [rax+2*rdx]
        movq        xmm3,           [rax+rcx]

    ; row 0: widen to words, add DC, saturate back to bytes
        punpcklbw   xmm0,           xmm5
        paddw       xmm0,           xmm4
        packuswb    xmm0,           xmm5

    ; row 1
        punpcklbw   xmm1,           xmm5
        paddw       xmm1,           xmm4
        packuswb    xmm1,           xmm5

    ; row 2
        punpcklbw   xmm2,           xmm5
        paddw       xmm2,           xmm4
        packuswb    xmm2,           xmm5

    ; row 3
        punpcklbw   xmm3,           xmm5
        paddw       xmm3,           xmm4
        packuswb    xmm3,           xmm5

    ; write rows 0 and 1, step dst down two rows, write rows 2 and 3
        movq        [rax],          xmm0
        movq        [rax + rdx],    xmm1

        lea         rax,            [rax + 2*rdx]

        movq        [rax],          xmm2
        movq        [rax + rdx],    xmm3

    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
98
;void vp8_idct_dequant_full_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *dst  - 2
;   int dst_stride      - 3
; )
;
; Dequantizes two consecutive 4x4 coefficient blocks (32 shorts at qcoeff),
; runs the two-pass inverse transform on both at once, adds the result to
; four 8-byte rows of dst and stores them back with unsigned saturation.
; The 64 bytes at qcoeff are zeroed once they have been read.
; qcoeff and dequant are accessed with movdqa / memory-operand pmullw, so
; both must be 16-byte aligned.
; Scratch: rax, rcx, rdx, rdi (saved/restored), xmm0-xmm7 (xmm6/xmm7 are
; handled by SAVE_XMM / RESTORE_XMM).
global sym(vp8_idct_dequant_full_2x_sse2) PRIVATE
sym(vp8_idct_dequant_full_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    SAVE_XMM 7
    GET_GOT     rbx
    push        rdi
    ; end prolog

        mov         rax,            arg(0) ; qcoeff
        mov         rdx,            arg(1)  ; dequant
        mov         rdi,            arg(2) ; dst


    ; Zero out xmm7, used to clear the coefficient buffer
        pxor        xmm7,           xmm7


    ; note the transpose of xmm1 and xmm2, necessary for shuffle
    ;   to spit out sensible data:
    ;   xmm0 = block 0 coeffs 0-7,  xmm2 = block 0 coeffs 8-15
    ;   xmm1 = block 1 coeffs 0-7,  xmm3 = block 1 coeffs 8-15
        movdqa      xmm0,           [rax]
        movdqa      xmm2,           [rax+16]
        movdqa      xmm1,           [rax+32]
        movdqa      xmm3,           [rax+48]

    ; Clear out coeffs
        movdqa      [rax],          xmm7
        movdqa      [rax+16],       xmm7
        movdqa      [rax+32],       xmm7
        movdqa      [rax+48],       xmm7

    ; dequantize qcoeff buffer (same 16 dequant factors for both blocks)
        pmullw      xmm0,           [rdx]
        pmullw      xmm2,           [rdx+16]
        pmullw      xmm1,           [rdx]
        pmullw      xmm3,           [rdx+16]

    ; dequant pointer is no longer needed; rdx holds the stride from here on
        movsxd      rdx,            dword ptr arg(3) ; dst_stride

    ; repack so block 0 row x and block 1 row x are together
    ;   afterwards xmmN = row N, block 0 in the low half, block 1 in the high
        movdqa      xmm4,           xmm0
        punpckldq   xmm0,           xmm1
        punpckhdq   xmm4,           xmm1

        pshufd      xmm0,           xmm0,       11011000b
        pshufd      xmm1,           xmm4,       11011000b

        movdqa      xmm4,           xmm2
        punpckldq   xmm2,           xmm3
        punpckhdq   xmm4,           xmm3

        pshufd      xmm2,           xmm2,       11011000b
        pshufd      xmm3,           xmm4,       11011000b

    ; first pass
        psubw       xmm0,           xmm2        ; b1 = 0-2
        paddw       xmm2,           xmm2        ;

        movdqa      xmm5,           xmm1
        paddw       xmm2,           xmm0        ; a1 = 0+2

    ; x_s1sqr2 is negative as a signed word, so pmulhw followed by adding the
    ; operand back yields the product with the unsigned 16-bit constant
        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
        lea         rcx,            [rdx + rdx*2]   ;dst_stride * 3
        paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)

        movdqa      xmm7,           xmm3
        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]

        paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
        psubw       xmm7,           xmm5        ; ip3*cos - ip1*sin

        movdqa      xmm5,           xmm1
        movdqa      xmm4,           xmm3

        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
        paddw       xmm5,           xmm1        ; ip1 * cos(pi/8) * sqrt(2)

        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
        paddw       xmm3,           xmm4        ; ip3 * sin(pi/8) * sqrt(2)

        paddw       xmm3,           xmm5        ; d1
        movdqa      xmm6,           xmm2        ; a1

        movdqa      xmm4,           xmm0        ; b1
        paddw       xmm2,           xmm3        ; a1 + d1   -> 000..003

        paddw       xmm4,           xmm7        ; b1 + xmm7 -> 008..011
        psubw       xmm0,           xmm7        ; b1 - xmm7 -> 004..007

        psubw       xmm6,           xmm3        ; a1 - d1   -> 012..015

    ; transpose for the second pass
        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100

        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108


        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002

        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102


        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001

        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003

        pshufd      xmm0,           xmm2,       11011000b
        pshufd      xmm2,           xmm1,       11011000b

        pshufd      xmm1,           xmm5,       11011000b
        pshufd      xmm3,           xmm7,       11011000b

    ; second pass (same butterfly as the first, plus rounding and >> 3)
        psubw       xmm0,           xmm2            ; b1 = 0-2
        paddw       xmm2,           xmm2

        movdqa      xmm5,           xmm1
        paddw       xmm2,           xmm0            ; a1 = 0+2

        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
        paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)

        movdqa      xmm7,           xmm3
        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]

        paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)
        psubw       xmm7,           xmm5            ; ip3*cos - ip1*sin

        movdqa      xmm5,           xmm1
        movdqa      xmm4,           xmm3

        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
        paddw       xmm5,           xmm1            ; ip1 * cos(pi/8) * sqrt(2)

        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
        paddw       xmm3,           xmm4            ; ip3 * sin(pi/8) * sqrt(2)

        paddw       xmm3,           xmm5            ; d1

    ; the rounding term is folded into a1 and b1 so that all four outputs
    ; derived from them carry the +4 before the shift
        paddw       xmm0,           [GLOBAL(fours)]

        paddw       xmm2,           [GLOBAL(fours)]
        movdqa      xmm6,           xmm2            ; a1

        movdqa      xmm4,           xmm0            ; b1
        paddw       xmm2,           xmm3            ; a1 + d1

        paddw       xmm4,           xmm7            ; b1 + xmm7
        psubw       xmm0,           xmm7            ; b1 - xmm7

        psubw       xmm6,           xmm3            ; a1 - d1
        psraw       xmm2,           3

        psraw       xmm0,           3
        psraw       xmm4,           3

        psraw       xmm6,           3

    ; transpose to save
        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100

        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108


        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002

        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102


        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001

        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003

        pshufd      xmm0,           xmm2,       11011000b
        pshufd      xmm2,           xmm1,       11011000b

        pshufd      xmm1,           xmm5,       11011000b
        pshufd      xmm3,           xmm7,       11011000b

    ; xmm7 was used as a temporary above; zero it again for unpack/pack
        pxor        xmm7,           xmm7

    ; Load up predict blocks, widen to words and add the residual rows
        movq        xmm4,           [rdi]
        movq        xmm5,           [rdi+rdx]

        punpcklbw   xmm4,           xmm7
        punpcklbw   xmm5,           xmm7

        paddw       xmm0,           xmm4
        paddw       xmm1,           xmm5

        movq        xmm4,           [rdi+2*rdx]
        movq        xmm5,           [rdi+rcx]

        punpcklbw   xmm4,           xmm7
        punpcklbw   xmm5,           xmm7

        paddw       xmm2,           xmm4
        paddw       xmm3,           xmm5

    ; pack up before storing (unsigned saturation to bytes)
        packuswb    xmm0,           xmm7
        packuswb    xmm1,           xmm7
        packuswb    xmm2,           xmm7
        packuswb    xmm3,           xmm7

    ; store blocks back out
        movq        [rdi],          xmm0
        movq        [rdi + rdx],    xmm1
        movq        [rdi + rdx*2],  xmm2
        movq        [rdi + rcx],    xmm3

    ; begin epilog
    pop         rdi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
354
;void vp8_idct_dequant_dc_0_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *dst  - 2
;   int dst_stride      - 3
;   short *dc           - 4
; )
;
; DC-only reconstruction for two side-by-side 4x4 blocks whose DC values
; are supplied through dc[0] and dc[1]. Each value is rounded as
; (dc + 4) >> 3 and added to its block's half of four 8-byte rows of dst,
; with unsigned saturation on the way back to bytes.
; qcoeff and dequant are part of the signature but are not read or written
; by this routine.
; Scratch: rcx, rdx, rdi (saved/restored), xmm0-xmm5.
global sym(vp8_idct_dequant_dc_0_2x_sse2) PRIVATE
sym(vp8_idct_dequant_dc_0_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rdi
    ; end prolog

    ; special case when 2 blocks have 0 or 1 coeffs
    ; dc is passed in separately, so no need to load qcoeff
        mov         rdi,            arg(2) ; dst
        mov         rdx,            arg(4) ; dc

    ; Zero out xmm5, for use unpacking
        pxor        xmm5,           xmm5

    ; load up 2 dc words here == 2*16 = doubleword
        movd        xmm4,           [rdx]

    ; dc pointer is consumed; rdx becomes the stride, rcx = 3 * stride
        movsxd      rdx,            dword ptr arg(3) ; dst_stride
        lea         rcx, [rdx + rdx*2]

    ; Load up predict blocks
        movq        xmm0,           [rdi]
        movq        xmm1,           [rdi+rdx*1]
        movq        xmm2,           [rdi+rdx*2]
        movq        xmm3,           [rdi+rcx]

    ; Duplicate and expand dc across:
    ;   low four words = dc[0], high four words = dc[1]
        punpcklwd   xmm4,           xmm4
        punpckldq   xmm4,           xmm4

    ; Rounding and downshift: (dc + 4) >> 3
        paddw       xmm4,           [GLOBAL(fours)]
        psraw       xmm4,           3

    ; Predict buffer needs to be expanded from bytes to words
        punpcklbw   xmm0,           xmm5
        punpcklbw   xmm1,           xmm5
        punpcklbw   xmm2,           xmm5
        punpcklbw   xmm3,           xmm5

    ; Add to predict buffer
        paddw       xmm0,           xmm4
        paddw       xmm1,           xmm4
        paddw       xmm2,           xmm4
        paddw       xmm3,           xmm4

    ; pack up before storing (unsigned saturation to bytes)
        packuswb    xmm0,           xmm5
        packuswb    xmm1,           xmm5
        packuswb    xmm2,           xmm5
        packuswb    xmm3,           xmm5

    ; store blocks back out
        movq        [rdi],          xmm0
        movq        [rdi + rdx],    xmm1
        movq        [rdi + rdx*2],  xmm2
        movq        [rdi + rcx],    xmm3

    ; begin epilog
    pop         rdi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp8_idct_dequant_dc_full_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *dst  - 2
;   int dst_stride      - 3
;   short *dc           - 4
; )
;
; Dequantizes two consecutive 4x4 coefficient blocks (32 shorts at qcoeff),
; replaces each block's first (DC) coefficient with dc[0] / dc[1]
; respectively, runs the two-pass inverse transform on both blocks at once,
; adds the result to four 8-byte rows of dst and stores them back with
; unsigned saturation. The 64 bytes at qcoeff are zeroed once read.
; qcoeff and dequant are accessed with movdqa / memory-operand pmullw, so
; both must be 16-byte aligned.
; Scratch: rax, rcx, rdx, rdi (saved/restored), xmm0-xmm7 (xmm6/xmm7 are
; handled by SAVE_XMM / RESTORE_XMM).
global sym(vp8_idct_dequant_dc_full_2x_sse2) PRIVATE
sym(vp8_idct_dequant_dc_full_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rdi
    ; end prolog

        mov         rax,            arg(0) ; qcoeff
        mov         rdx,            arg(1)  ; dequant

        mov         rdi,            arg(2) ; dst

    ; Zero out xmm7, used to clear the coefficient buffer
        pxor        xmm7,           xmm7


    ; note the transpose of xmm1 and xmm2, necessary for shuffle
    ;   to spit out sensible data:
    ;   xmm0 = block 0 coeffs 0-7,  xmm2 = block 0 coeffs 8-15
    ;   xmm1 = block 1 coeffs 0-7,  xmm3 = block 1 coeffs 8-15
        movdqa      xmm0,           [rax]
        movdqa      xmm2,           [rax+16]
        movdqa      xmm1,           [rax+32]
        movdqa      xmm3,           [rax+48]

    ; Clear out coeffs
        movdqa      [rax],          xmm7
        movdqa      [rax+16],       xmm7
        movdqa      [rax+32],       xmm7
        movdqa      [rax+48],       xmm7

    ; dequantize qcoeff buffer (same 16 dequant factors for both blocks)
        pmullw      xmm0,           [rdx]
        pmullw      xmm2,           [rdx+16]
        pmullw      xmm1,           [rdx]
        pmullw      xmm3,           [rdx+16]

    ; DC component: dequant pointer is done with, rdx now points at dc[]
        mov         rdx,            arg(4)

    ; repack so block 0 row x and block 1 row x are together
    ;   afterwards xmmN = row N, block 0 in the low half, block 1 in the high
        movdqa      xmm4,           xmm0
        punpckldq   xmm0,           xmm1
        punpckhdq   xmm4,           xmm1

        pshufd      xmm0,           xmm0,       11011000b
        pshufd      xmm1,           xmm4,       11011000b

        movdqa      xmm4,           xmm2
        punpckldq   xmm2,           xmm3
        punpckhdq   xmm4,           xmm3

        pshufd      xmm2,           xmm2,       11011000b
        pshufd      xmm3,           xmm4,       11011000b

    ; insert DC component: overwrite word 0 (block 0) with dc[0] and
    ;   word 4 (block 1) with dc[1]; the dequantized DC products are dropped
        pinsrw      xmm0,           [rdx],      0
        pinsrw      xmm0,           [rdx+2],    4

    ; first pass
        psubw       xmm0,           xmm2        ; b1 = 0-2
        paddw       xmm2,           xmm2        ;

        movdqa      xmm5,           xmm1
        paddw       xmm2,           xmm0        ; a1 = 0+2

    ; x_s1sqr2 is negative as a signed word, so pmulhw followed by adding the
    ; operand back yields the product with the unsigned 16-bit constant
        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
        paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)

        movdqa      xmm7,           xmm3
        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]

        paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
        psubw       xmm7,           xmm5        ; ip3*cos - ip1*sin

        movdqa      xmm5,           xmm1
        movdqa      xmm4,           xmm3

        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
        paddw       xmm5,           xmm1        ; ip1 * cos(pi/8) * sqrt(2)

        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
        paddw       xmm3,           xmm4        ; ip3 * sin(pi/8) * sqrt(2)

        paddw       xmm3,           xmm5        ; d1
        movdqa      xmm6,           xmm2        ; a1

        movdqa      xmm4,           xmm0        ; b1
        paddw       xmm2,           xmm3        ; a1 + d1   -> 000..003

        paddw       xmm4,           xmm7        ; b1 + xmm7 -> 008..011
        psubw       xmm0,           xmm7        ; b1 - xmm7 -> 004..007

        psubw       xmm6,           xmm3        ; a1 - d1   -> 012..015

    ; transpose for the second pass
        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100

        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108


        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002

        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102


        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001

        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003

        pshufd      xmm0,           xmm2,       11011000b
        pshufd      xmm2,           xmm1,       11011000b

        pshufd      xmm1,           xmm5,       11011000b
        pshufd      xmm3,           xmm7,       11011000b

    ; second pass (same butterfly as the first, plus rounding and >> 3)
        psubw       xmm0,           xmm2            ; b1 = 0-2
        paddw       xmm2,           xmm2

        movdqa      xmm5,           xmm1
        paddw       xmm2,           xmm0            ; a1 = 0+2

        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
        paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)

        movdqa      xmm7,           xmm3
        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]

        paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)
        psubw       xmm7,           xmm5            ; ip3*cos - ip1*sin

        movdqa      xmm5,           xmm1
        movdqa      xmm4,           xmm3

        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
        paddw       xmm5,           xmm1            ; ip1 * cos(pi/8) * sqrt(2)

        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
        paddw       xmm3,           xmm4            ; ip3 * sin(pi/8) * sqrt(2)

        paddw       xmm3,           xmm5            ; d1

    ; the rounding term is folded into a1 and b1 so that all four outputs
    ; derived from them carry the +4 before the shift
        paddw       xmm0,           [GLOBAL(fours)]

        paddw       xmm2,           [GLOBAL(fours)]
        movdqa      xmm6,           xmm2            ; a1

        movdqa      xmm4,           xmm0            ; b1
        paddw       xmm2,           xmm3            ; a1 + d1

        paddw       xmm4,           xmm7            ; b1 + xmm7
        psubw       xmm0,           xmm7            ; b1 - xmm7

        psubw       xmm6,           xmm3            ; a1 - d1
        psraw       xmm2,           3

        psraw       xmm0,           3
        psraw       xmm4,           3

        psraw       xmm6,           3

    ; transpose to save
        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100

        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108


        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002

        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102


        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001

        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003

        pshufd      xmm0,           xmm2,       11011000b
        pshufd      xmm2,           xmm1,       11011000b

        pshufd      xmm1,           xmm5,       11011000b
        pshufd      xmm3,           xmm7,       11011000b

    ; xmm7 was used as a temporary above; zero it again for unpack/pack
        pxor        xmm7,           xmm7

    ; Load up predict blocks, widen to words and add the residual rows.
    ; rdx (stride) and rcx (3 * stride) stay live through the stores below.
        movsxd      rdx,            dword ptr arg(3) ; dst_stride
        movq        xmm4,           [rdi]
        movq        xmm5,           [rdi+rdx]
        lea         rcx,            [rdx + rdx*2]

        punpcklbw   xmm4,           xmm7
        punpcklbw   xmm5,           xmm7

        paddw       xmm0,           xmm4
        paddw       xmm1,           xmm5

        movq        xmm4,           [rdi+rdx*2]
        movq        xmm5,           [rdi+rcx]

        punpcklbw   xmm4,           xmm7
        punpcklbw   xmm5,           xmm7

        paddw       xmm2,           xmm4
        paddw       xmm3,           xmm5

    ; pack up before storing (unsigned saturation to bytes)
        packuswb    xmm0,           xmm7
        packuswb    xmm1,           xmm7
        packuswb    xmm2,           xmm7
        packuswb    xmm3,           xmm7

    ; store blocks back out
        movq        [rdi],          xmm0
        movq        [rdi + rdx],    xmm1
        movq        [rdi + rdx*2],  xmm2
        movq        [rdi + rcx],    xmm3

    ; begin epilog
    pop         rdi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
700
SECTION_RODATA

; Each constant is replicated across all eight word lanes and 16-byte
; aligned so it can be used directly as a memory operand of paddw / pmulhw.

; rounding term added before the final arithmetic shift right by 3
align 16
fours:
    times 8 dw 4

; 35468 = 0x8A8C ~= sqrt(2) * sin(pi/8) * 65536
; (negative when read as a signed word; users add the operand back after
;  pmulhw)
align 16
x_s1sqr2:
    times 8 dw 35468

; 20091 = 0x4E7B ~= (sqrt(2) * cos(pi/8) - 1) * 65536
align 16
x_c1sqr2less1:
    times 8 dw 20091
711