;*****************************************************************************
;* x86-optimized functions for fspp filter
;*
;* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License along
;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; 64-entry dither table laid out as 8-byte rows. store_slice/store_slice2
; consume one row (8 bytes) per output line.
pb_dither: db 0,  48,  12,  60,   3,  51,  15,  63, 32,  16,  44,  28,  35,  19,  47,  31, \
              8,  56,   4,  52,  11,  59,   7,  55, 40,  24,  36,  20,  43,  27,  39,  23, \
              2,  50,  14,  62,   1,  49,  13,  61, 34,  18,  46,  30,  33,  17,  45,  29, \
             10,  58,   6,  54,   9,  57,   5,  53, 42,  26,  38,  22,  41,  25,  37,  21

; Fixed-point multipliers for pmulhw, replicated into all four words of an
; MMX register. FIX64(v, n) in the comments denotes v * 2^n rounded to an
; integer; 0xAC62 is the 16-bit two's-complement encoding of a negative value.
pw_187E: times 4 dw 0x187E ; FIX64(0.382683433, 14)
pw_22A3: times 4 dw 0x22A3 ; FIX64(1.082392200, 13)
pw_2D41: times 4 dw 0x2D41 ; FIX64(1.414213562, 13)
pw_539F: times 4 dw 0x539F ; FIX64(1.306562965, 14)
pw_5A82: times 4 dw 0x5A82 ; FIX64(1.414213562, 14)
pw_3B21: times 4 dw 0x3B21 ; FIX64(1.847759065, 13)
pw_AC62: times 4 dw 0xAC62 ; FIX64(-2.613125930, 13)
pw_3642: times 4 dw 0x3642 ; FIX64(0.847759065, 14)
pw_2441: times 4 dw 0x2441 ; FIX64(0.566454497, 14)
pw_0CBB: times 4 dw 0x0CBB ; FIX64(0.198912367, 14)
pw_4:    times 4 dw 4      ; rounding bias added before the >> 3 in row_idct
pw_2:    times 4 dw 2      ; rounding bias added before the >> 2 in COLUMN_FDCT

SECTION .text

; Number of coefficients per row; rows of int16 data are DCTSIZE*2 bytes apart.
%define DCTSIZE 8

INIT_MMX mmx

;void ff_store_slice_mmx(uint8_t *dst, int16_t *src,
;                        ptrdiff_t dst_stride, ptrdiff_t src_stride,
;                        ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
;
; For each row y and each column x (width is rounded up to a multiple of 8):
;   dst[x] = sat_u8((src[x] + (pb_dither[y][x & 7] >> log2_scale)) >> (6 - log2_scale))
; and both src[x] and the int16 located 8 src rows above it are cleared.
;
; dst_stride is in bytes, src_stride in int16 elements (it is doubled when
; the per-row src advance is computed). The loops are do-while shaped, so at
; least one row and one group of 8 pixels are always processed.
; The dither pointer walks pb_dither linearly without wrapping.
; NOTE(review): that implies callers pass height <= 8 - confirm at call sites.
;
; On x86-32 dst_stride/src_stride live in their argument stack slots
; (r2m/r3m) and are updated in place.
; Clobbers: m0-m5, m7, tmp, tmp2, flags.
%if ARCH_X86_64
cglobal store_slice, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
%else
cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
%define dst_strideq r2m
%define src_strideq r3m
    mov       widthq, r4m
    mov       dither_heightq, r5m
    mov       ditherq, r6m ; log2_scale
%endif
    add       widthq, 7
    mov       tmpq, src_strideq
    and       widthq, ~7                  ; width rounded up to a multiple of 8
    sub       dst_strideq, widthq         ; dst bytes to skip at the end of a row
    movd      m5, ditherd                 ; m5 = log2_scale (dither shift)
    xor       ditherq, -1                 ; ~log2_scale ...
    mov       tmp2q, tmpq
    add       ditherq, 7                  ; ... + 7 = 6 - log2_scale
    neg       tmpq
    sub       tmp2q, widthq
    movd      m2, ditherd                 ; m2 = 6 - log2_scale (output shift)
    add       tmp2q, tmp2q                ; (src_stride - width) * sizeof(int16_t)
    lea       ditherq, [pb_dither]        ; from here on: current dither row
    mov       src_strideq, tmp2q          ; src bytes to skip at the end of a row
    shl       tmpq, 4                     ; byte offset of the row 8 lines above
    lea       dither_heightq, [ditherq+dither_heightq*8] ; end of dither rows
    pxor      m7, m7                      ; constant zero

.loop_height:
    movq      m3, [ditherq]               ; 8 dither bytes for this row
    movq      m4, m3
    punpcklbw m3, m7                      ; widen to words: columns 0-3
    punpckhbw m4, m7                      ;                 columns 4-7
    mov       tmp2q, widthq               ; tmp2 = pixels left in this row
    psraw     m3, m5
    psraw     m4, m5

.loop_width:
    movq      [srcq+tmpq], m7             ; clear the values 8 rows above
    movq      m0, [srcq]
    movq      m1, [srcq+8]
    movq      [srcq+tmpq+8], m7
    paddw     m0, m3
    paddw     m1, m4
    movq      [srcq], m7                  ; clear the values just consumed
    psraw     m0, m2
    psraw     m1, m2
    movq      [srcq+8], m7
    packuswb  m0, m1                      ; saturate 8 words to unsigned bytes
    add       srcq, 16
    movq      [dstq], m0
    add       dstq, 8
    sub       tmp2q, 8
    jg .loop_width

    add       srcq, src_strideq
    add       ditherq, 8
    add       dstq, dst_strideq
    cmp       ditherq, dither_heightq
    jb .loop_height                       ; pointer comparison: unsigned
    RET

;void ff_store_slice2_mmx(uint8_t *dst, int16_t *src,
;                         ptrdiff_t dst_stride, ptrdiff_t src_stride,
;                         ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
;
; For each row y and each column x (width is rounded up to a multiple of 8):
;   dst[x] = sat_u8((src[x] + src[x + 16 * src_stride]
;                    + (pb_dither[y][x & 7] >> log2_scale)) >> (6 - log2_scale))
; and only src[x + 16 * src_stride] is cleared; src[x] itself is left intact.
;
; dst_stride is in bytes, src_stride in int16 elements. The loops are
; do-while shaped, so at least one row and one group of 8 pixels are always
; processed. The dither pointer walks pb_dither linearly without wrapping.
; NOTE(review): that implies callers pass height <= 8 - confirm at call sites.
;
; On x86-32 all arguments are fetched from the stack; dst_stride/src_stride
; stay in their stack slots (r2m/r3m) and are updated in place.
; Clobbers: m0-m7, tmp, tmp2, flags.
%if ARCH_X86_64
cglobal store_slice2, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
%else
cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
%define dst_strideq r2m
%define src_strideq r3m
    mov       dstq, dstm
    mov       srcq, srcm
    mov       widthq, r4m
    mov       dither_heightq, r5m
    mov       ditherq, r6m ; log2_scale
%endif
    add       widthq, 7
    mov       tmpq, src_strideq
    and       widthq, ~7                  ; width rounded up to a multiple of 8
    sub       dst_strideq, widthq         ; dst bytes to skip at the end of a row
    movd      m5, ditherd                 ; m5 = log2_scale (dither shift)
    xor       ditherq, -1                 ; ~log2_scale ...
    mov       tmp2q, tmpq
    add       ditherq, 7                  ; ... + 7 = 6 - log2_scale
    sub       tmp2q, widthq
    movd      m2, ditherd                 ; m2 = 6 - log2_scale (output shift)
    add       tmp2q, tmp2q                ; (src_stride - width) * sizeof(int16_t)
    lea       ditherq, [pb_dither]        ; from here on: current dither row
    mov       src_strideq, tmp2q          ; src bytes to skip at the end of a row
    shl       tmpq, 5                     ; byte offset of the row 16 lines below
    lea       dither_heightq, [ditherq+dither_heightq*8] ; end of dither rows
    pxor      m7, m7                      ; constant zero

.loop_height:
    movq      m3, [ditherq]               ; 8 dither bytes for this row
    movq      m4, m3
    punpcklbw m3, m7                      ; widen to words: columns 0-3
    punpckhbw m4, m7                      ;                 columns 4-7
    mov       tmp2q, widthq               ; tmp2 = pixels left in this row
    psraw     m3, m5
    psraw     m4, m5

.loop_width:
    movq      m0, [srcq]
    movq      m1, [srcq+8]
    paddw     m0, m3
    paddw     m0, [srcq+tmpq]             ; add the values 16 rows below
    paddw     m1, m4
    movq      m6, [srcq+tmpq+8]
    movq      [srcq+tmpq], m7             ; ... and clear them once consumed
    psraw     m0, m2
    paddw     m1, m6
    movq      [srcq+tmpq+8], m7
    psraw     m1, m2
    packuswb  m0, m1                      ; saturate 8 words to unsigned bytes
    movq      [dstq], m0
    add       srcq, 16
    add       dstq, 8
    sub       tmp2q, 8
    jg .loop_width

    add       srcq, src_strideq
    add       ditherq, 8
    add       dstq, dst_strideq
    cmp       ditherq, dither_heightq
    jb .loop_height                       ; pointer comparison: unsigned
    RET

;void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
;
; Multiplies the 64 int16 values (16 quadwords) at thr_adr_noq by the low
; 16 bits of q and stores the low 16 bits of each product (pmullw) at
; thr_adr. Loads, multiplies and stores are software-pipelined across
; m0-m6; m7 holds q broadcast to all four words.
; Clobbers: m0-m7.
cglobal mul_thrmat, 3, 3, 0, thrn, thr, q
    movd      m7, qd
    movq      m0, [thrnq]
    punpcklwd m7, m7
    movq      m1, [thrnq+8]
    punpckldq m7, m7                      ; m7 = q in every word
    pmullw    m0, m7
    movq      m2, [thrnq+8*2]
    pmullw    m1, m7
    movq      m3, [thrnq+8*3]
    pmullw    m2, m7
    movq      [thrq], m0
    movq      m4, [thrnq+8*4]
    pmullw    m3, m7
    movq      [thrq+8], m1
    movq      m5, [thrnq+8*5]
    pmullw    m4, m7
    movq      [thrq+8*2], m2
    movq      m6, [thrnq+8*6]
    pmullw    m5, m7
    movq      [thrq+8*3], m3
    movq      m0, [thrnq+8*7]
    pmullw    m6, m7
    movq      [thrq+8*4], m4
    movq      m1, [thrnq+8*8]
    pmullw    m0, m7
    movq      [thrq+8*5], m5
    movq      m2, [thrnq+8*9]
    pmullw    m1, m7
    movq      [thrq+8*6], m6
    movq      m3, [thrnq+8*10]
    pmullw    m2, m7
    movq      [thrq+8*7], m0
    movq      m4, [thrnq+8*11]
    pmullw    m3, m7
    movq      [thrq+8*8], m1
    movq      m5, [thrnq+8*12]
    pmullw    m4, m7
    movq      [thrq+8*9], m2
    movq      m6, [thrnq+8*13]
    pmullw    m5, m7
    movq      [thrq+8*10], m3
    movq      m0, [thrnq+8*14]
    pmullw    m6, m7
    movq      [thrq+8*11], m4
    movq      m1, [thrnq+8*15]
    pmullw    m0, m7
    movq      [thrq+8*12], m5
    pmullw    m1, m7
    movq      [thrq+8*13], m6
    movq      [thrq+8*14], m0
    movq      [thrq+8*15], m1
    RET

; COLUMN_FDCT label [, thr_offset [, extra_advance]]
;   %1  label to jump to when the fast path cannot be used (see below)
;   %2  byte offset added to every thrq access (default 0)
;   %3  extra bytes added to srcq/outq on top of the 8 consumed (default 0)
;
; Works on 4 adjacent int16 columns: reads 8 rows from srcq (rows are
; DCTSIZE*2 bytes apart), runs the column butterflies (a forward DCT, going
; by the macro name) and thresholds each resulting row against the matching
; threshold row at [thrq + k*16 + %2], k = 0..7.
;
; Threshold idiom, for a value x and threshold t:
;     psubw x,t / paddusw x,t / paddw x,t / psubusw x,t
; leaves x unchanged when its magnitude reaches t and turns it into 0
; otherwise (dead zone), using only saturating/wrapping word arithmetic.
;
; The even part is finished first and parked in four stack slots
; [rsp], [rsp+8], [rsp+8*2], [rsp+8*3]. Then:
;   * if any thresholded value of rows 3, 5 or 7 is non-zero, control jumps
;     to %1 with m0, m1, m5, m6 holding the thresholded odd rows and the
;     stack slots filled; srcq/outq are NOT advanced on this path
;     (in column_fidct the code at %1 is COLUMN_IDCT, which does that);
;   * otherwise the fall-through path reconstructs the odd contribution
;     from row 1 alone, adds the result into out rows 0-5, overwrites out
;     rows 6-7, and advances srcq and outq by 8+%3 bytes.
;
; Requires 32 bytes of scratch at [rsp]. Clobbers m0-m7, tmpd, flags.
%macro COLUMN_FDCT 1-3 0, 0
    movq      m1, [srcq+DCTSIZE*0*2]
    movq      m7, [srcq+DCTSIZE*3*2]
    movq      m0, m1
    paddw     m1, [srcq+DCTSIZE*7*2]
    movq      m3, m7
    paddw     m7, [srcq+DCTSIZE*4*2]
    movq      m5, m1
    movq      m6, [srcq+DCTSIZE*1*2]
    psubw     m1, m7
    movq      m2, [srcq+DCTSIZE*2*2]
    movq      m4, m6
    paddw     m6, [srcq+DCTSIZE*6*2]
    paddw     m5, m7
    paddw     m2, [srcq+DCTSIZE*5*2]
    movq      m7, m6
    paddw     m6, m2
    psubw     m7, m2
    movq      m2, m5
    paddw     m5, m6
    psubw     m2, m6
    paddw     m7, m1
    movq      m6, [thrq+4*16+%2]
    psllw     m7, 2
    psubw     m5, [thrq+%2]               ; threshold row 0 (m5) and row 4 (m2)
    psubw     m2, m6
    paddusw   m5, [thrq+%2]
    paddusw   m2, m6
    pmulhw    m7, [pw_2D41]
    paddw     m5, [thrq+%2]
    paddw     m2, m6
    psubusw   m5, [thrq+%2]
    psubusw   m2, m6
    paddw     m5, [pw_2]                  ; rounding bias for the >> 2 below
    movq      m6, m2
    paddw     m2, m5
    psubw     m5, m6
    movq      m6, m1
    paddw     m1, m7
    psubw     m1, [thrq+2*16+%2]          ; threshold row 2 (m1) and row 6 (m6)
    psubw     m6, m7
    movq      m7, [thrq+6*16+%2]
    psraw     m5, 2
    paddusw   m1, [thrq+2*16+%2]
    psubw     m6, m7
    paddw     m1, [thrq+2*16+%2]
    paddusw   m6, m7
    psubusw   m1, [thrq+2*16+%2]
    paddw     m6, m7
    psubw     m3, [srcq+DCTSIZE*4*2]
    psubusw   m6, m7
    movq      m7, m1
    psraw     m2, 2
    psubw     m4, [srcq+DCTSIZE*6*2]
    psubw     m1, m6
    psubw     m0, [srcq+DCTSIZE*7*2]
    paddw     m6, m7
    psraw     m6, 2
    movq      m7, m2
    pmulhw    m1, [pw_5A82]
    paddw     m2, m6
    movq      [rsp], m2                   ; even-part result, slot 0
    psubw     m7, m6
    movq      m2, [srcq+DCTSIZE*2*2]
    psubw     m1, m6
    psubw     m2, [srcq+DCTSIZE*5*2]
    movq      m6, m5
    movq      [rsp+8*3], m7               ; even-part result, slot 3
    paddw     m3, m2
    paddw     m2, m4
    paddw     m4, m0
    movq      m7, m3
    psubw     m3, m4
    psllw     m3, 2
    psllw     m7, 2
    pmulhw    m3, [pw_187E]
    psllw     m4, 2
    pmulhw    m7, [pw_22A3]
    psllw     m2, 2
    pmulhw    m4, [pw_539F]
    paddw     m5, m1
    pmulhw    m2, [pw_2D41]
    psubw     m6, m1
    paddw     m7, m3
    movq      [rsp+8], m5                 ; even-part result, slot 1
    paddw     m4, m3
    movq      m3, [thrq+3*16+%2]
    movq      m1, m0
    movq      [rsp+8*2], m6               ; even-part result, slot 2
    psubw     m1, m2
    paddw     m0, m2
    movq      m5, m1
    movq      m2, [thrq+5*16+%2]
    psubw     m1, m7
    paddw     m5, m7
    psubw     m1, m3                      ; threshold rows 3 (m1), 5 (m5),
    movq      m7, [thrq+16+%2]            ; 7 (m6) and 1 (m0), interleaved
    psubw     m5, m2
    movq      m6, m0
    paddw     m0, m4
    paddusw   m1, m3
    psubw     m6, m4
    movq      m4, [thrq+7*16+%2]
    psubw     m0, m7
    psubw     m6, m4
    paddusw   m5, m2
    paddusw   m6, m4
    paddw     m1, m3
    paddw     m5, m2
    paddw     m6, m4
    psubusw   m1, m3
    psubusw   m5, m2
    psubusw   m6, m4
    movq      m4, m1
    por       m4, m5
    paddusw   m0, m7
    por       m4, m6                      ; m4 = OR of thresholded rows 3, 5, 7
    paddw     m0, m7
    packssdw  m4, m4                      ; fold 64 bits into 32, zero stays zero
    psubusw   m0, m7
    movd      tmpd, m4
    test      tmpd, tmpd
    jnz %1                                ; something survived: take the full path
    ; Fast path: only row 1 (m0) contributes to the odd part.
    movq      m4, [rsp]
    movq      m1, m0
    pmulhw    m0, [pw_3642]
    movq      m2, m1
    movq      m5, [outq+DCTSIZE*0*2]
    movq      m3, m2
    pmulhw    m1, [pw_2441]
    paddw     m5, m4
    movq      m6, [rsp+8]
    psraw     m3, 2
    pmulhw    m2, [pw_0CBB]
    psubw     m4, m3
    movq      m7, [outq+DCTSIZE*1*2]
    paddw     m5, m3
    movq      [outq+DCTSIZE*7*2], m4
    paddw     m7, m6
    movq      m3, [rsp+8*2]
    psubw     m6, m0
    movq      m4, [outq+DCTSIZE*2*2]
    paddw     m7, m0
    movq      [outq], m5
    paddw     m4, m3
    movq      [outq+DCTSIZE*6*2], m6
    psubw     m3, m1
    movq      m5, [outq+DCTSIZE*5*2]
    paddw     m4, m1
    movq      m6, [outq+DCTSIZE*3*2]
    paddw     m5, m3
    movq      m0, [rsp+8*3]
    add       srcq, 8+%3
    movq      [outq+DCTSIZE*1*2], m7
    paddw     m6, m0
    movq      [outq+DCTSIZE*2*2], m4
    psubw     m0, m2
    movq      m7, [outq+DCTSIZE*4*2]
    paddw     m6, m2
    movq      [outq+DCTSIZE*5*2], m5
    paddw     m7, m0
    movq      [outq+DCTSIZE*3*2], m6
    movq      [outq+DCTSIZE*4*2], m7
    add       outq, 8+%3
%endmacro

; COLUMN_IDCT [extra_advance]
;   %1  extra bytes added to srcq/outq on top of the 8 consumed (default 0)
;
; Full-precision continuation of COLUMN_FDCT for 4 int16 columns, entered
; when COLUMN_FDCT jumps away from its fast path.
;
; Inputs:  m0, m1, m5, m6 - the odd-part values left by COLUMN_FDCT
;          [rsp], [rsp+8], [rsp+8*2], [rsp+8*3] - the even-part values
; Output:  the combined result is added into out rows 0-5 and written
;          (overwriting) to out rows 6-7; rows are DCTSIZE*2 bytes apart.
;          srcq and outq are both advanced by 8+%1 bytes.
; Clobbers m0-m7, flags.
%macro COLUMN_IDCT 0-1 0
    movq      m3, m5
    psubw     m5, m1
    psllw     m5, 1
    paddw     m3, m1
    movq      m2, m0
    psubw     m0, m6
    movq      m1, m5
    psllw     m0, 1
    pmulhw    m1, [pw_AC62]
    paddw     m5, m0
    pmulhw    m5, [pw_3B21]
    paddw     m2, m6
    pmulhw    m0, [pw_22A3]
    movq      m7, m2
    movq      m4, [rsp]
    psubw     m2, m3
    psllw     m2, 1
    paddw     m7, m3
    pmulhw    m2, [pw_2D41]
    movq      m6, m4
    psraw     m7, 2
    paddw     m4, [outq]
    psubw     m6, m7
    movq      m3, [rsp+8]
    paddw     m4, m7
    movq      [outq+DCTSIZE*7*2], m6      ; row 7 is overwritten, not accumulated
    paddw     m1, m5
    movq      [outq], m4
    psubw     m1, m7
    movq      m7, [rsp+8*2]
    psubw     m0, m5
    movq      m6, [rsp+8*3]
    movq      m5, m3
    paddw     m3, [outq+DCTSIZE*1*2]
    psubw     m5, m1
    psubw     m2, m1
    paddw     m3, m1
    movq      [outq+DCTSIZE*6*2], m5      ; row 6 is overwritten, not accumulated
    movq      m4, m7
    paddw     m7, [outq+DCTSIZE*2*2]
    psubw     m4, m2
    paddw     m4, [outq+DCTSIZE*5*2]
    paddw     m7, m2
    movq      [outq+DCTSIZE*1*2], m3
    paddw     m0, m2
    movq      [outq+DCTSIZE*2*2], m7
    movq      m1, m6
    paddw     m6, [outq+DCTSIZE*4*2]
    psubw     m1, m0
    paddw     m1, [outq+DCTSIZE*3*2]
    paddw     m6, m0
    movq      [outq+DCTSIZE*5*2], m4
    add       srcq, 8+%1
    movq      [outq+DCTSIZE*4*2], m6
    movq      [outq+DCTSIZE*3*2], m1
    add       outq, 8+%1
%endmacro

;void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
;
; Runs the column transform + threshold + inverse pipeline over data and
; accumulates into output. Each loop iteration handles two groups of four
; int16 columns:
;   group 1: thresholds at thr_adr + 0, pointers advance by 8 bytes
;   group 2: thresholds at thr_adr + 8, pointers advance by 8 + 16 bytes
; so data/output move 32 bytes per iteration while cnt drops by 2; the loop
; is do-while shaped and repeats while cnt stays positive.
; For each group COLUMN_FDCT either finishes on its own fast path or jumps
; to the matching COLUMN_IDCT (.idct1 / .idct2).
; 32 bytes of stack scratch are reserved for the two macros.
cglobal column_fidct, 4, 5, 0, 32, thr, src, out, cnt, tmp
.fdct1:
    COLUMN_FDCT .idct1
    jmp .fdct2                            ; fast path taken: skip the full IDCT

.idct1:
    COLUMN_IDCT

.fdct2:
    COLUMN_FDCT .idct2, 8, 16
    sub    cntd, 2
    jg .fdct1
    RET

.idct2:
    COLUMN_IDCT 16
    sub    cntd, 2
    jg .fdct1
    RET

;void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
;
; Each of the cnt iterations (do-while, so at least one) reads a tile of
; 4 rows x 8 int16 from workspace (rows DCTSIZE*2 bytes apart), transposes
; it in registers, applies the row transform (an inverse DCT, going by the
; function name) and adds (x + 4) >> 3 of every result to a tile of
; 8 rows x 4 int16 at output_adr.
; Per iteration workspace advances by DCTSIZE*2*4 bytes and output_adr by
; 8 bytes (four int16 columns).
; output_stride is given in int16 elements; it is doubled on entry to get
; a byte stride.
; Uses 16 bytes of stack scratch. Clobbers m0-m7, stride3, flags.
cglobal row_idct, 4, 5, 0, 16, src, dst, stride, cnt, stride3
    add       strideq, strideq            ; elements -> bytes
    lea       stride3q, [strideq+strideq*2]
.loop:
    ; Left half of the tile (coefficients 0-3 of each of the 4 rows).
    movq      m0, [srcq+DCTSIZE*0*2]
    movq      m1, [srcq+DCTSIZE*1*2]
    movq      m4, m0
    movq      m2, [srcq+DCTSIZE*2*2]
    punpcklwd m0, m1
    movq      m3, [srcq+DCTSIZE*3*2]
    punpckhwd m4, m1
    movq      m7, m2
    punpcklwd m2, m3
    movq      m6, m0
    punpckldq m0, m2
    punpckhdq m6, m2
    movq      m5, m0
    punpckhwd m7, m3
    psubw     m0, m6
    pmulhw    m0, [pw_5A82]
    movq      m2, m4
    punpckldq m4, m7
    paddw     m5, m6
    punpckhdq m2, m7
    movq      m1, m4
    psllw     m0, 2
    paddw     m4, m2
    ; Right half of the tile (coefficients 4-7), loads interleaved.
    movq      m3, [srcq+DCTSIZE*0*2+8]
    psubw     m1, m2
    movq      m2, [srcq+DCTSIZE*1*2+8]
    psubw     m0, m5
    movq      m6, m4
    paddw     m4, m5
    psubw     m6, m5
    movq      m7, m1
    movq      m5, [srcq+DCTSIZE*2*2+8]
    paddw     m1, m0
    movq      [rsp], m4
    movq      m4, m3
    movq      [rsp+8], m6
    punpcklwd m3, m2
    movq      m6, [srcq+DCTSIZE*3*2+8]
    punpckhwd m4, m2
    movq      m2, m5
    punpcklwd m5, m6
    psubw     m7, m0
    punpckhwd m2, m6
    movq      m0, m3
    punpckldq m3, m5
    punpckhdq m0, m5
    movq      m5, m4
    movq      m6, m3
    punpckldq m4, m2
    psubw     m3, m0
    punpckhdq m5, m2
    paddw     m6, m0
    movq      m2, m4
    movq      m0, m3
    psubw     m4, m5
    pmulhw    m0, [pw_AC62]
    paddw     m3, m4
    pmulhw    m3, [pw_3B21]
    paddw     m2, m5
    pmulhw    m4, [pw_22A3]
    movq      m5, m2
    psubw     m2, m6
    paddw     m5, m6
    pmulhw    m2, [pw_2D41]
    paddw     m0, m3
    psllw     m0, 3
    psubw     m4, m3
    movq      m6, [rsp]
    movq      m3, m1
    psllw     m4, 3
    psubw     m0, m5
    psllw     m2, 3
    paddw     m1, m0
    psubw     m2, m0
    psubw     m3, m0
    paddw     m4, m2
    movq      m0, m7
    paddw     m7, m2
    psubw     m0, m2
    movq      m2, [pw_4]                  ; rounding bias for the >> 3 below
    psubw     m6, m5
    paddw     m5, [rsp]
    paddw     m1, m2
    paddw     m5, m2
    psraw     m1, 3
    paddw     m7, m2
    psraw     m5, 3
    ; Accumulate into output rows 0-2 ...
    paddw     m5, [dstq]
    psraw     m7, 3
    paddw     m1, [dstq+strideq*1]
    paddw     m0, m2
    paddw     m7, [dstq+strideq*2]
    paddw     m3, m2
    movq      [dstq], m5
    paddw     m6, m2
    movq      [dstq+strideq*1], m1
    psraw     m0, 3
    movq      [dstq+strideq*2], m7
    add       dstq, stride3q              ; ... then rows 3-7 relative to row 3
    movq      m5, [rsp+8]
    psraw     m3, 3
    paddw     m0, [dstq+strideq*2]
    psubw     m5, m4
    paddw     m3, [dstq+stride3q*1]
    psraw     m6, 3
    paddw     m4, [rsp+8]
    paddw     m5, m2
    paddw     m6, [dstq+strideq*4]
    paddw     m4, m2
    movq      [dstq+strideq*2], m0
    psraw     m5, 3
    paddw     m5, [dstq]
    psraw     m4, 3
    paddw     m4, [dstq+strideq*1]
    add       srcq, DCTSIZE*2*4
    movq      [dstq+stride3q*1], m3
    movq      [dstq+strideq*4], m6
    movq      [dstq], m5
    movq      [dstq+strideq*1], m4
    sub       dstq, stride3q              ; back to row 0 ...
    add       dstq, 8                     ; ... of the next four columns
    dec       cntd
    jnz .loop
    RET

;void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt);
;
; Each of the cnt iterations (do-while, so at least one) reads a tile of
; 8 rows x 4 pixels from pixels (rows line_size bytes apart), widens the
; bytes to words, applies the transform along the 8 rows (a forward DCT,
; going by the function name) and stores the transposed result as
; 4 rows x 8 int16 in data (rows DCTSIZE*2 bytes apart).
; Per iteration pixels advances by 4 bytes and data by DCTSIZE*4*2 bytes.
; Uses 16 bytes of stack scratch. Clobbers m0-m7, stride3, flags.
cglobal row_fdct, 4, 5, 0, 16, src, pix, stride, cnt, stride3
    lea       stride3q, [strideq+strideq*2]
.loop:
    movd      m0, [pixq]                  ; pixel rows 0-2
    pxor      m7, m7
    movd      m1, [pixq+strideq*1]
    punpcklbw m0, m7
    movd      m2, [pixq+strideq*2]
    punpcklbw m1, m7
    punpcklbw m2, m7
    add       pixq, stride3q              ; pixq -> row 3; rows 3-7 follow
    movq      m5, m0
    movd      m3, [pixq+strideq*4]        ; row 7
    movq      m6, m1
    movd      m4, [pixq+stride3q*1]       ; row 6
    punpcklbw m3, m7
    psubw     m5, m3
    punpcklbw m4, m7
    paddw     m0, m3
    psubw     m6, m4
    movd      m3, [pixq+strideq*2]        ; row 5
    paddw     m1, m4
    movq      [rsp], m5                   ; row0 - row7
    punpcklbw m3, m7
    movq      [rsp+8], m6                 ; row1 - row6
    movq      m4, m2
    movd      m5, [pixq]                  ; row 3
    paddw     m2, m3
    movd      m6, [pixq+strideq*1]        ; row 4
    punpcklbw m5, m7
    psubw     m4, m3
    punpcklbw m6, m7
    movq      m3, m5
    paddw     m5, m6
    psubw     m3, m6
    movq      m6, m0
    movq      m7, m1
    psubw     m0, m5
    psubw     m1, m2
    paddw     m7, m2
    paddw     m1, m0
    movq      m2, m7
    psllw     m1, 2
    paddw     m6, m5
    pmulhw    m1, [pw_2D41]
    paddw     m7, m6
    psubw     m6, m2
    movq      m5, m0
    movq      m2, m7
    punpcklwd m7, m6
    paddw     m0, m1
    punpckhwd m2, m6
    psubw     m5, m1
    movq      m6, m0
    movq      m1, [rsp+8]
    punpcklwd m0, m5
    punpckhwd m6, m5
    movq      m5, m0
    punpckldq m0, m7
    paddw     m3, m4
    punpckhdq m5, m7
    movq      m7, m6
    movq      [srcq+DCTSIZE*0*2], m0      ; first 4 int16 of each output row
    punpckldq m6, m2
    movq      [srcq+DCTSIZE*1*2], m5
    punpckhdq m7, m2
    movq      [srcq+DCTSIZE*2*2], m6
    paddw     m4, m1
    movq      [srcq+DCTSIZE*3*2], m7
    psllw     m3, 2
    movq      m2, [rsp]
    psllw     m4, 2
    pmulhw    m4, [pw_2D41]
    paddw     m1, m2
    psllw     m1, 2
    movq      m0, m3
    pmulhw    m0, [pw_22A3]
    psubw     m3, m1
    pmulhw    m3, [pw_187E]
    movq      m5, m2
    pmulhw    m1, [pw_539F]
    psubw     m2, m4
    paddw     m5, m4
    movq      m6, m2
    paddw     m0, m3
    movq      m7, m5
    paddw     m2, m0
    psubw     m6, m0
    movq      m4, m2
    paddw     m1, m3
    punpcklwd m2, m6
    paddw     m5, m1
    punpckhwd m4, m6
    psubw     m7, m1
    movq      m6, m5
    punpcklwd m5, m7
    punpckhwd m6, m7
    movq      m7, m2
    punpckldq m2, m5
    sub       pixq, stride3q              ; back to pixel row 0
    punpckhdq m7, m5
    movq      m5, m4
    movq      [srcq+DCTSIZE*0*2+8], m2    ; last 4 int16 of each output row
    punpckldq m4, m6
    movq      [srcq+DCTSIZE*1*2+8], m7
    punpckhdq m5, m6
    movq      [srcq+DCTSIZE*2*2+8], m4
    add       pixq, 4                     ; next four pixel columns
    movq      [srcq+DCTSIZE*3*2+8], m5
    add       srcq, DCTSIZE*4*2
    dec       cntd
    jnz .loop
    RET
