1############################################################################
2##                           **** WAVPACK ****                            ##
3##                  Hybrid Lossless Wavefile Compressor                   ##
4##              Copyright (c) 1998 - 2015 Conifer Software.               ##
5##                          All Rights Reserved.                          ##
6##      Distributed under the BSD Software License (see license.txt)      ##
7############################################################################
8
9        .intel_syntax noprefix
10        .text
11
12        .globl  _unpack_decorr_stereo_pass_cont_x64win
13        .globl  _unpack_decorr_mono_pass_cont_x64win
14
15        .globl  unpack_decorr_stereo_pass_cont_x64win
16        .globl  unpack_decorr_mono_pass_cont_x64win
17
18        .globl  _unpack_decorr_stereo_pass_cont_x64
19        .globl  _unpack_decorr_mono_pass_cont_x64
20
21        .globl  unpack_decorr_stereo_pass_cont_x64
22        .globl  unpack_decorr_mono_pass_cont_x64
23
24# This is an assembly optimized version of the following WavPack function:
25#
26# void unpack_decorr_stereo_pass_cont (struct decorr_pass *dpp,
27#                                      int32_t *buffer,
28#                                      int32_t sample_count,
#                                      int32_t long_math);
30#
31# It performs a single pass of stereo decorrelation on the provided buffer.
32# Note that this version of the function requires that up to 8 previous
33# stereo samples are visible and correct. In other words, it ignores the
34# "samples_*" fields in the decorr_pass structure and gets the history data
35# directly from the buffer. It does, however, return the appropriate history
36# samples to the decorr_pass structure before returning.
37#
38# The "long_math" argument is used to specify that a 32-bit multiply is
39# not enough for the "apply_weight" operation (although in this case it
40# would only apply to the -1 and -2 terms because the MMX code does not have
41# this limitation) but we ignore the parameter and use the overflow detection
42# of the "imul" instruction to switch automatically to the "long_math" loop.
43#
44# This is written to work on an X86-64 processor (also called the AMD64)
45# running in 64-bit mode and generally uses the MMX extensions to improve
46# the performance by processing both stereo channels together. Unfortunately
47# this is not easily used for terms -1 and -2, so these terms are handled
48# sequentially with regular assembler code.
49#
50# This version has entry points for both the System V ABI and the Windows
51# X64 ABI. It does not use the "red zone" or the "shadow area"; it saves the
52# non-volatile registers for both ABIs on the stack and allocates another
53# 8 bytes on the stack to store the dpp pointer. Note that it does NOT
54# provide unwind data for the Windows ABI (the unpack_x64.asm module for
55# MSVC does). The arguments are passed in registers:
56#
57# System V  Windows
58#   rdi       rcx      struct decorr_pass *dpp
59#   rsi       rdx      int32_t *buffer
60#   edx       r8       int32_t sample_count
61#   ecx       r9       int32_t long_math
62#
63# registers after entry:
64#
65#   rdi         bptr
66#   rsi         eptr
67#
68# stack usage:
69#
70# [rsp+0] = *dpp
71#
72
#
# Stereo entry points. The Windows entry copies its argument registers into
# the System V ones and joins the shared body at "entry". Both entries build
# the same frame (four pushes plus one 8-byte slot holding dpp) so the common
# exit at "done" can unwind either of them.
#
# A sample_count that is zero or negative returns without touching the
# buffer or the decorr_pass structure.
#

_unpack_decorr_stereo_pass_cont_x64win:
unpack_decorr_stereo_pass_cont_x64win:
        push    rbp
        push    rbx
        push    rdi                         # rdi and rsi are callee-saved on Windows
        push    rsi
        sub     rsp, 8                      # slot at [rsp] for the dpp pointer
        mov     rdi, rcx                    # move the Windows arguments into the
        mov     rsi, rdx                    # System V registers so the code below
        mov     rdx, r8                     # serves both ABIs
        mov     rcx, r9
        jmp     entry                       # join the shared body

_unpack_decorr_stereo_pass_cont_x64:
unpack_decorr_stereo_pass_cont_x64:
        push    rbp
        push    rbx
        push    rdi
        push    rsi
        sub     rsp, 8                      # slot at [rsp] for the dpp pointer

entry:  mov     [rsp], rdi                  # store dpp* at [rsp]
        and     edx, edx                    # set flags and zero-extend edx into rdx (used as index below)
        jle     done                        # OF is clear after AND, so this catches count <= 0

        mov     rdi, rsi                    # rdi = bptr
        lea     rsi, [rdi+rdx*8]            # rsi = eptr (8 bytes per stereo sample)

        mov     rax, [rsp]                  # rax = dpp
        mov     eax, [rax]                  # eax = dpp->term; dispatch on its low byte
        cmp     al, 17
        je      term_17_entry
        cmp     al, 18
        je      term_18_entry
        cmp     al, -1
        je      term_minus_1_entry
        cmp     al, -2
        je      term_minus_2_entry
        cmp     al, -3
        je      term_minus_3_entry          # any other term falls through to the code that follows
113
114#
115# registers in default term loop:
116#
117#   rbx         term * -8 (for indexing correlation sample)
118#   rdi         bptr
119#   rsi         eptr
120#
121#   mm0, mm1    scratch
122#   mm2         original sample values
123#   mm3         correlation sample
124#   mm4         zero (for pcmpeqd)
125#   mm5         weights
126#   mm6         delta
127#   mm7         512 (for rounding)
128#
129
default_term_entry:
        imul    rbx, rax, -8                # rbx = term * -8 = byte offset back to the correlation sample
        mov     eax, 512
        movd    mm7, eax
        punpckldq mm7, mm7                  # mm7 = 512 in each dword (rounding constant)
        mov     rdx, [rsp]                  # rdx = dpp
        mov     eax, [rdx+4]
        movd    mm6, eax
        punpckldq mm6, mm6                  # mm6 = delta in each dword
        mov     eax, 0xFFFF
        movd    mm5, eax
        punpckldq mm5, mm5                  # mm5 = 0x0000FFFF0000FFFF
        pand    mm5, [rdx+8]                # mm5 = weight_A / weight_B cut to 16 bits (upper words zero for pmaddwd)
        pxor    mm4, mm4                    # mm4 = 0, reference value for the pcmpeqd zero tests
        jmp     default_term_next

        .balign  64
default_term_next:
        movq    mm3, [rdi+rbx]              # mm3 = sam_AB, the stereo sample "term" samples back
        movq    mm1, mm3
        movq    mm0, mm3
        paddd   mm1, mm1
        psrld   mm0, 15                     # mm0 = sam_AB >> 15 (upper part)
        psrlw   mm1, 1                      # low word of each dword = sam_AB & 0x7FFF (lower part)
        pmaddwd mm0, mm5                    # upper part * weight
        pmaddwd mm1, mm5                    # lower part * weight
        movq    mm2, [rdi]                  # mm2 = left_right, the samples being decoded
        pslld   mm0, 5                      # upper product scaled by 2^15 / 2^10
        paddd   mm1, mm7                    # lower product + 512 ...
        psrad   mm1, 10                     # ... >> 10 gives the rounded lower contribution
        paddd   mm0, mm2
        paddd   mm0, mm1                    # mm0 = left_right + weighted sam_AB
        movq    [rdi], mm0                  # write the decoded pair back
        movq    mm0, mm3
        pxor    mm0, mm2
        psrad   mm0, 31                     # mm0 = all 1s where sam_AB and left_right differ in sign
        add     rdi, 8                      # bptr advances one stereo sample
        pcmpeqd mm2, mm4                    # mm2 = all 1s where left_right == 0
        pcmpeqd mm3, mm4                    # mm3 = all 1s where sam_AB == 0
        por     mm2, mm3
        pandn   mm2, mm6                    # mm2 = delta, or 0 where either value was zero
        pxor    mm5, mm0                    # complement / add / complement:
        paddw   mm5, mm2                    # weight += delta where the signs match,
        pxor    mm5, mm0                    # weight -= delta where they differ
        cmp     rdi, rsi
        jb      default_term_next           # loop while bptr < eptr

        pslld   mm5, 16
        psrad   mm5, 16                     # sign-extend the 16-bit weights back to dwords
        mov     rdx, [rsp]                  # rdx = dpp
        movq    [rdx+8], mm5                # write weight_A / weight_B back
        emms                                # leave MMX state

        mov     ecx, [rdx]                  # ecx = dpp->term = number of history samples to save

default_save_history:
        dec     ecx
        sub     rdi, 8                      # step back one stereo sample
        mov     eax, [rdi+4]
        mov     [rdx+rcx*4+48], eax         # samples_B [ecx]
        mov     eax, [rdi]
        mov     [rdx+rcx*4+16], eax         # samples_A [ecx]
        test    ecx, ecx
        jnz     default_save_history        # repeat until index 0 has been stored
        jmp     done
195
196#
197# registers in term 17 & 18 loops:
198#
199#   rdi         bptr
200#   rsi         eptr
201#
202#   mm0, mm1    scratch
203#   mm2         original sample values
204#   mm3         correlation samples
205#   mm4         last calculated values (so we don't need to reload)
206#   mm5         weights
207#   mm6         delta
208#   mm7         512 (for rounding)
209#
210
term_17_entry:
        mov     eax, 512
        movd    mm7, eax
        punpckldq mm7, mm7                  # mm7 = 512 in each dword (rounding constant)
        mov     rdx, [rsp]                  # rdx = dpp
        mov     eax, [rdx+4]
        movd    mm6, eax
        punpckldq mm6, mm6                  # mm6 = delta in each dword
        mov     eax, 0xFFFF
        movd    mm5, eax
        punpckldq mm5, mm5                  # mm5 = 0x0000FFFF0000FFFF
        pand    mm5, [rdx+8]                # mm5 = weight_A / weight_B cut to 16 bits (upper words zero for pmaddwd)
        movq    mm4, [rdi-8]                # mm4 = previous stereo sample (stands in for the last result)
        jmp     term_17_next

        .balign  64
term_17_next:
        paddd   mm4, mm4
        psubd   mm4, [rdi-16]               # mm4 = sam_AB = 2 * previous sample - the one before it
        movq    mm3, mm4                    # mm3 keeps sam_AB for the weight update
        movq    mm1, mm3
        paddd   mm1, mm1
        psrld   mm4, 15                     # mm4 = sam_AB >> 15 (upper part)
        psrlw   mm1, 1                      # low word of each dword = sam_AB & 0x7FFF (lower part)
        pmaddwd mm4, mm5                    # upper part * weight
        pmaddwd mm1, mm5                    # lower part * weight
        movq    mm2, [rdi]                  # mm2 = left_right, the samples being decoded
        pslld   mm4, 5                      # upper product scaled by 2^15 / 2^10
        paddd   mm1, mm7                    # lower product + 512 ...
        psrad   mm1, 10                     # ... >> 10 gives the rounded lower contribution
        paddd   mm4, mm2
        paddd   mm4, mm1                    # mm4 = decoded pair (reused by the next iteration)
        movq    mm0, mm3
        movq    [rdi], mm4                  # write the decoded pair back
        pxor    mm0, mm2
        psrad   mm0, 31                     # mm0 = all 1s where sam_AB and left_right differ in sign
        add     rdi, 8                      # bptr advances one stereo sample
        pxor    mm1, mm1                    # mm1 = 0
        pcmpeqd mm2, mm1                    # mm2 = all 1s where left_right == 0
        pcmpeqd mm3, mm1                    # mm3 = all 1s where sam_AB == 0
        por     mm2, mm3
        pandn   mm2, mm6                    # mm2 = delta, or 0 where either value was zero
        pxor    mm5, mm0                    # complement / add / complement:
        paddw   mm5, mm2                    # weight += delta where the signs match,
        pxor    mm5, mm0                    # weight -= delta where they differ
        cmp     rdi, rsi
        jb      term_17_next                # loop while bptr < eptr
        jmp     term_1718_exit              # terms 17 and 18 share the weight / history write-back
259
term_18_entry:
        mov     eax, 512
        movd    mm7, eax
        punpckldq mm7, mm7                  # mm7 = 512 in each dword (rounding constant)
        mov     rdx, [rsp]                  # rdx = dpp
        mov     eax, [rdx+4]
        movd    mm6, eax
        punpckldq mm6, mm6                  # mm6 = delta in each dword
        mov     eax, 0xFFFF
        movd    mm5, eax
        punpckldq mm5, mm5                  # mm5 = 0x0000FFFF0000FFFF
        pand    mm5, [rdx+8]                # mm5 = weight_A / weight_B cut to 16 bits (upper words zero for pmaddwd)
        movq    mm4, [rdi-8]                # mm4 = previous stereo sample (stands in for the last result)
        jmp     term_18_next

        .balign  64
term_18_next:
        movq    mm3, mm4
        psubd   mm3, [rdi-16]               # previous sample - the one before it
        psrad   mm3, 1                      # half of that difference
        paddd   mm3, mm4                    # mm3 = sam_AB = previous + ((previous - older) >> 1)
        movq    mm1, mm3
        movq    mm4, mm3
        paddd   mm1, mm1
        psrld   mm4, 15                     # mm4 = sam_AB >> 15 (upper part)
        psrlw   mm1, 1                      # low word of each dword = sam_AB & 0x7FFF (lower part)
        pmaddwd mm4, mm5                    # upper part * weight
        pmaddwd mm1, mm5                    # lower part * weight
        movq    mm2, [rdi]                  # mm2 = left_right, the samples being decoded
        pslld   mm4, 5                      # upper product scaled by 2^15 / 2^10
        paddd   mm1, mm7                    # lower product + 512 ...
        psrad   mm1, 10                     # ... >> 10 gives the rounded lower contribution
        paddd   mm4, mm2
        paddd   mm4, mm1                    # mm4 = decoded pair (reused by the next iteration)
        movq    mm0, mm3
        movq    [rdi], mm4                  # write the decoded pair back
        pxor    mm0, mm2
        psrad   mm0, 31                     # mm0 = all 1s where sam_AB and left_right differ in sign
        add     rdi, 8                      # bptr advances one stereo sample
        pxor    mm1, mm1                    # mm1 = 0
        pcmpeqd mm2, mm1                    # mm2 = all 1s where left_right == 0
        pcmpeqd mm3, mm1                    # mm3 = all 1s where sam_AB == 0
        por     mm2, mm3
        pandn   mm2, mm6                    # mm2 = delta, or 0 where either value was zero
        pxor    mm5, mm0                    # complement / add / complement:
        paddw   mm5, mm2                    # weight += delta where the signs match,
        pxor    mm5, mm0                    # weight -= delta where they differ
        cmp     rdi, rsi
        jb      term_18_next                # loop while bptr < eptr

term_1718_exit:
        pslld   mm5, 16
        psrad   mm5, 16                     # sign-extend the 16-bit weights back to dwords
        mov     rdx, [rsp]                  # rdx = dpp
        movq    [rdx+8], mm5                # write weight_A / weight_B back
        emms                                # leave MMX state

        mov     eax, [rdi-4]
        mov     [rdx+48], eax               # dpp->samples_B [0] = bptr [-1]
        mov     eax, [rdi-8]
        mov     [rdx+16], eax               # dpp->samples_A [0] = bptr [-2]
        mov     eax, [rdi-12]
        mov     [rdx+52], eax               # dpp->samples_B [1] = bptr [-3]
        mov     eax, [rdi-16]
        mov     [rdx+20], eax               # dpp->samples_A [1] = bptr [-4]
        jmp     done
326
327#
328# registers in term -1 & -2 loops:
329#
330#   eax,ebx,edx scratch
331#   ecx         weight_A
332#   ebp         weight_B
333#   rdi         bptr
334#   rsi         eptr
335#   r8d         delta
336#
337
term_minus_1_entry:
        cld                                 # stosd below must move rdi upward
        mov     rdx, [rsp]                  # rdx = dpp
        mov     ecx, [rdx+8]                # ecx = weight_A
        mov     ebp, [rdx+12]               # ebp = weight_B
        mov     r8d, [rdx+4]                # r8d = delta
        mov     eax, [rdi-4]                # eax = bptr [-1], the first correlation sample
        jmp     m1_loop

#
# Fast path: 32-bit multiplies. The first imul that overflows moves control
# to the 64-bit loop further down, which then runs to the end of the buffer.
#

        .balign  64
m1_loop:
        mov     ebx, eax                    # ebx = correlation sample, kept for the weight update
        imul    eax, ecx                    # sample * weight_A
        mov     edx, [rdi]                  # edx = value to decode (mov leaves OF from imul intact)
        jo      m1_a_overflow
        sar     eax, 10                     # CF = bit 9 of the product
        adc     eax, edx                    # eax = edx + (product >> 10), rounded through CF
        stosd                               # store the result, bptr += 1
        test    ebx, ebx
        je      m1_do_b                     # no weight update when either value is zero
        test    edx, edx
        je      m1_do_b
        xor     ebx, edx
        sar     ebx, 31                     # ebx = 0 when the signs match, -1 when they differ
        xor     ecx, ebx                    # complement weight_A when ebx = -1
        add     ecx, r8d                    # step by delta
        mov     edx, 1024
        add     edx, ebx                    # limit: 1024, or 1023 for a complemented weight
        cmp     ecx, edx
        jle     m1_a_restore
        mov     ecx, edx                    # clamp to the limit
m1_a_restore:
        xor     ecx, ebx                    # undo the complement
m1_do_b:
        mov     ebx, eax                    # the value just stored is the next correlation sample
        imul    eax, ebp                    # sample * weight_B
        mov     edx, [rdi]                  # edx = value to decode
        jo      m1_b_overflow
        sar     eax, 10                     # CF = bit 9 of the product
        adc     eax, edx                    # eax = edx + (product >> 10), rounded through CF
        stosd                               # store the result, bptr += 1
        test    ebx, ebx
        je      m1_check_end                # no weight update when either value is zero
        test    edx, edx
        je      m1_check_end
        xor     ebx, edx
        sar     ebx, 31                     # ebx = 0 when the signs match, -1 when they differ
        xor     ebp, ebx                    # complement weight_B when ebx = -1
        add     ebp, r8d                    # step by delta
        mov     edx, 1024
        add     edx, ebx                    # limit: 1024, or 1023 for a complemented weight
        cmp     ebp, edx
        jle     m1_b_restore
        mov     ebp, edx                    # clamp to the limit
m1_b_restore:
        xor     ebp, ebx                    # undo the complement
m1_check_end:
        cmp     rdi, rsi
        jb      m1_loop                     # loop while bptr < eptr
        jmp     m1_done

m1_a_overflow:
        mov     eax, ebx                    # imul destroyed eax; ebx still holds the sample
        jmp     m1_long_loop

m1_b_overflow:
        mov     eax, ebx                    # imul destroyed eax; ebx still holds the sample
        jmp     m1_long_do_b

#
# Long-math path: same steps with the full 64-bit product in edx:eax.
#

        .balign  64
m1_long_loop:
        mov     ebx, eax                    # ebx = correlation sample, kept for the weight update
        imul    ecx                         # edx:eax = sample * weight_A
        shl     edx, 22                     # high half supplies the top bits of product >> 10
        shr     eax, 10                     # low half >> 10, CF = bit 9
        adc     eax, edx                    # eax = low 32 bits of the rounded (product >> 10)
        mov     edx, [rdi]                  # edx = value to decode
        add     eax, edx
        stosd                               # store the result, bptr += 1
        test    ebx, ebx
        je      m1_long_do_b                # no weight update when either value is zero
        test    edx, edx
        je      m1_long_do_b
        xor     ebx, edx
        sar     ebx, 31                     # ebx = 0 when the signs match, -1 when they differ
        xor     ecx, ebx                    # complement weight_A when ebx = -1
        add     ecx, r8d                    # step by delta
        mov     edx, 1024
        add     edx, ebx                    # limit: 1024, or 1023 for a complemented weight
        cmp     ecx, edx
        jle     m1_long_a_restore
        mov     ecx, edx                    # clamp to the limit
m1_long_a_restore:
        xor     ecx, ebx                    # undo the complement
m1_long_do_b:
        mov     ebx, eax                    # the value just stored is the next correlation sample
        imul    ebp                         # edx:eax = sample * weight_B
        shl     edx, 22                     # high half supplies the top bits of product >> 10
        shr     eax, 10                     # low half >> 10, CF = bit 9
        adc     eax, edx                    # eax = low 32 bits of the rounded (product >> 10)
        mov     edx, [rdi]                  # edx = value to decode
        add     eax, edx
        stosd                               # store the result, bptr += 1
        test    ebx, ebx
        je      m1_long_check_end           # no weight update when either value is zero
        test    edx, edx
        je      m1_long_check_end
        xor     ebx, edx
        sar     ebx, 31                     # ebx = 0 when the signs match, -1 when they differ
        xor     ebp, ebx                    # complement weight_B when ebx = -1
        add     ebp, r8d                    # step by delta
        mov     edx, 1024
        add     edx, ebx                    # limit: 1024, or 1023 for a complemented weight
        cmp     ebp, edx
        jle     m1_long_b_restore
        mov     ebp, edx                    # clamp to the limit
m1_long_b_restore:
        xor     ebp, ebx                    # undo the complement
m1_long_check_end:
        cmp     rdi, rsi
        jb      m1_long_loop                # loop while bptr < eptr

m1_done:
        mov     rdx, [rsp]                  # rdx = dpp
        mov     [rdx+8], ecx                # write weight_A back
        mov     [rdx+12], ebp               # write weight_B back
        mov     eax, [rdi-4]
        mov     [rdx+16], eax               # dpp->samples_A [0] = bptr [-1]
        jmp     done
457
term_minus_2_entry:
        mov     rdx, [rsp]                  # rdx = dpp
        mov     ecx, [rdx+8]                # ecx = weight_A
        mov     ebp, [rdx+12]               # ebp = weight_B
        mov     r8d, [rdx+4]                # r8d = delta
        mov     eax, [rdi-8]                # eax = bptr [-2], the first correlation sample
        jmp     m2_loop

#
# Fast path: 32-bit multiplies. The first imul that overflows moves control
# to the 64-bit loop further down, which then runs to the end of the buffer.
#

        .balign  64
m2_loop:
        mov     ebx, eax                    # ebx = correlation sample, kept for the weight update
        imul    eax, ebp                    # sample * weight_B
        mov     edx, [rdi+4]                # edx = bptr [1] (mov leaves OF from imul intact)
        jo      m2_b_overflow
        sar     eax, 10                     # CF = bit 9 of the product
        adc     eax, edx                    # eax = edx + (product >> 10), rounded through CF
        mov     [rdi+4], eax                # bptr [1] = result
        test    ebx, ebx
        je      m2_do_a                     # no weight update when either value is zero
        test    edx, edx
        je      m2_do_a
        xor     ebx, edx
        sar     ebx, 31                     # ebx = 0 when the signs match, -1 when they differ
        xor     ebp, ebx                    # complement weight_B when ebx = -1
        add     ebp, r8d                    # step by delta
        mov     edx, 1024
        add     edx, ebx                    # limit: 1024, or 1023 for a complemented weight
        cmp     ebp, edx
        jle     m2_b_restore
        mov     ebp, edx                    # clamp to the limit
m2_b_restore:
        xor     ebp, ebx                    # undo the complement
m2_do_a:
        mov     ebx, eax                    # the value just stored is the next correlation sample
        imul    eax, ecx                    # sample * weight_A
        mov     edx, [rdi]                  # edx = bptr [0]
        jo      m2_a_overflow
        sar     eax, 10                     # CF = bit 9 of the product
        adc     eax, edx                    # eax = edx + (product >> 10), rounded through CF
        mov     [rdi], eax                  # bptr [0] = result
        test    ebx, ebx
        je      m2_advance                  # no weight update when either value is zero
        test    edx, edx
        je      m2_advance
        xor     ebx, edx
        sar     ebx, 31                     # ebx = 0 when the signs match, -1 when they differ
        xor     ecx, ebx                    # complement weight_A when ebx = -1
        add     ecx, r8d                    # step by delta
        mov     edx, 1024
        add     edx, ebx                    # limit: 1024, or 1023 for a complemented weight
        cmp     ecx, edx
        jle     m2_a_restore
        mov     ecx, edx                    # clamp to the limit
m2_a_restore:
        xor     ecx, ebx                    # undo the complement
m2_advance:
        add     rdi, 8                      # bptr advances one stereo sample
        cmp     rdi, rsi
        jb      m2_loop                     # loop while bptr < eptr
        jmp     m2_done

m2_b_overflow:
        mov     eax, ebx                    # imul destroyed eax; ebx still holds the sample
        jmp     m2_long_loop

m2_a_overflow:
        mov     eax, ebx                    # imul destroyed eax; ebx still holds the sample
        jmp     m2_long_do_a

#
# Long-math path: same steps with the full 64-bit product in edx:eax.
#

        .balign  64
m2_long_loop:
        mov     ebx, eax                    # ebx = correlation sample, kept for the weight update
        imul    ebp                         # edx:eax = sample * weight_B
        shl     edx, 22                     # high half supplies the top bits of product >> 10
        shr     eax, 10                     # low half >> 10, CF = bit 9
        adc     eax, edx                    # eax = low 32 bits of the rounded (product >> 10)
        mov     edx, [rdi+4]                # edx = bptr [1]
        add     eax, edx
        mov     [rdi+4], eax                # bptr [1] = result
        test    ebx, ebx
        je      m2_long_do_a                # no weight update when either value is zero
        test    edx, edx
        je      m2_long_do_a
        xor     ebx, edx
        sar     ebx, 31                     # ebx = 0 when the signs match, -1 when they differ
        xor     ebp, ebx                    # complement weight_B when ebx = -1
        add     ebp, r8d                    # step by delta
        mov     edx, 1024
        add     edx, ebx                    # limit: 1024, or 1023 for a complemented weight
        cmp     ebp, edx
        jle     m2_long_b_restore
        mov     ebp, edx                    # clamp to the limit
m2_long_b_restore:
        xor     ebp, ebx                    # undo the complement
m2_long_do_a:
        mov     ebx, eax                    # the value just stored is the next correlation sample
        imul    ecx                         # edx:eax = sample * weight_A
        shl     edx, 22                     # high half supplies the top bits of product >> 10
        shr     eax, 10                     # low half >> 10, CF = bit 9
        adc     eax, edx                    # eax = low 32 bits of the rounded (product >> 10)
        mov     edx, [rdi]                  # edx = bptr [0]
        add     eax, edx
        mov     [rdi], eax                  # bptr [0] = result
        test    ebx, ebx
        je      m2_long_advance             # no weight update when either value is zero
        test    edx, edx
        je      m2_long_advance
        xor     ebx, edx
        sar     ebx, 31                     # ebx = 0 when the signs match, -1 when they differ
        xor     ecx, ebx                    # complement weight_A when ebx = -1
        add     ecx, r8d                    # step by delta
        mov     edx, 1024
        add     edx, ebx                    # limit: 1024, or 1023 for a complemented weight
        cmp     ecx, edx
        jle     m2_long_a_restore
        mov     ecx, edx                    # clamp to the limit
m2_long_a_restore:
        xor     ecx, ebx                    # undo the complement
m2_long_advance:
        add     rdi, 8                      # bptr advances one stereo sample
        cmp     rdi, rsi
        jb      m2_long_loop                # loop while bptr < eptr

m2_done:
        mov     rdx, [rsp]                  # rdx = dpp
        mov     [rdx+8], ecx                # write weight_A back
        mov     [rdx+12], ebp               # write weight_B back
        mov     eax, [rdi-8]
        mov     [rdx+48], eax               # dpp->samples_B [0] = bptr [-2]
        jmp     done
578
579#
580# registers in term -3 loop:
581#
582#   rdi         bptr
583#   rsi         eptr
584#
585#   mm0, mm1    scratch
586#   mm2         original sample values
587#   mm3         correlation samples
588#   mm4         last calculated values (so we don't need to reload)
589#   mm5         weights
590#   mm6         delta
591#   mm7         512 (for rounding)
592#
593
term_minus_3_entry:
        mov     eax, 512
        movd    mm7, eax
        punpckldq mm7, mm7                  # mm7 = 512 in each dword (rounding constant)
        mov     rdx, [rsp]                  # rdx = dpp
        mov     eax, [rdx+4]
        movd    mm6, eax
        punpckldq mm6, mm6                  # mm6 = delta in each dword
        mov     eax, 0xFFFF
        movd    mm5, eax
        punpckldq mm5, mm5                  # mm5 = 0x0000FFFF0000FFFF
        pand    mm5, [rdx+8]                # mm5 = weight_A / weight_B cut to 16 bits (upper words zero for pmaddwd)
        movq    mm4, [rdi-8]                # mm4 = previous stereo sample
        jmp     term_minus_3_next

        .balign  64
term_minus_3_next:
        movq    mm3, mm4
        psrlq   mm3, 32                     # low dword of mm3 = high dword of the previous sample
        punpckldq mm3, mm4                  # mm3 = sam_AB = previous sample with its two dwords swapped
        movq    mm1, mm3
        movq    mm4, mm3
        pslld   mm1, 1
        psrld   mm4, 15                     # mm4 = sam_AB >> 15 (upper part)
        psrlw   mm1, 1                      # low word of each dword = sam_AB & 0x7FFF (lower part)
        pmaddwd mm4, mm5                    # upper part * weight
        pmaddwd mm1, mm5                    # lower part * weight
        movq    mm2, [rdi]                  # mm2 = left_right, the samples being decoded
        pslld   mm4, 5                      # upper product scaled by 2^15 / 2^10
        paddd   mm1, mm7                    # lower product + 512 ...
        psrad   mm1, 10                     # ... >> 10 gives the rounded lower contribution
        paddd   mm4, mm2
        paddd   mm4, mm1                    # mm4 = decoded pair (reused by the next iteration)
        movq    [rdi], mm4                  # write the decoded pair back
        movq    mm0, mm3
        pxor    mm0, mm2
        psrad   mm0, 31                     # mm0 = all 1s where sam_AB and left_right differ in sign
        add     rdi, 8                      # bptr advances one stereo sample
        pxor    mm1, mm1                    # mm1 = 0
        pcmpeqd mm2, mm1                    # mm2 = all 1s where left_right == 0
        pcmpeqd mm3, mm1                    # mm3 = all 1s where sam_AB == 0
        por     mm2, mm3
        pandn   mm2, mm6                    # mm2 = delta, or 0 where either value was zero
        pcmpeqd mm1, mm1                    # mm1 = -1
        psubd   mm1, mm7
        psubd   mm1, mm7                    # mm1 = -1025
        psubd   mm1, mm0                    # mm1 = -1025, or -1024 where the signs differ
        pxor    mm5, mm0                    # complement the weight where the signs differ
        paddw   mm5, mm1                    # bias the weight toward the top of the unsigned word range
        paddusw mm5, mm2                    # add delta with unsigned saturation (acts as the clamp)
        psubw   mm5, mm1                    # remove the bias
        pxor    mm5, mm0                    # undo the complement
        cmp     rdi, rsi
        jb      term_minus_3_next           # loop while bptr < eptr

        pslld   mm5, 16
        psrad   mm5, 16                     # sign-extend the 16-bit weights back to dwords
        mov     rdx, [rsp]                  # rdx = dpp
        movq    [rdx+8], mm5                # write weight_A / weight_B back
        emms                                # leave MMX state

        mov     edx, [rdi-4]
        mov     rax, [rsp]                  # rax = dpp
        mov     [rax+16], edx               # dpp->samples_A [0] = bptr [-1]
        mov     edx, [rdi-8]
        mov     [rax+48], edx               # dpp->samples_B [0] = bptr [-2]

done:   add     rsp, 8                      # drop the dpp slot
        pop     rsi                         # restore in reverse push order
        pop     rdi
        pop     rbx
        pop     rbp
        ret
667
668#######################################################################################################################
669#
670# This is the mono version of the above function. It does not use MMX and does not handle negative terms.
671#
672# void unpack_decorr_mono_pass_cont (struct decorr_pass *dpp,
673#                                    int32_t *buffer,
674#                                    int32_t sample_count,
#                                    int32_t long_math);
676# arguments on entry:
677#
678# System V  Windows
679#   rdi       rcx      struct decorr_pass *dpp
680#   rsi       rdx      int32_t *buffer
681#   edx       r8       int32_t sample_count
682#   ecx       r9       int32_t long_math
683#
684# registers after entry:
685#
686#   rdi         bptr
687#   rsi         eptr
688#
689# stack usage:
690#
691# [rsp+0] = *dpp
692#
693
#
# Mono entry points. The Windows entry copies its argument registers into
# the System V ones and joins the shared body at "mentry". Both entries build
# the same frame (four pushes plus one 8-byte slot holding dpp) so the common
# exit at "mono_done" can unwind either of them.
#
# A sample_count that is zero or negative returns without touching the
# buffer or the decorr_pass structure.
#

_unpack_decorr_mono_pass_cont_x64win:
unpack_decorr_mono_pass_cont_x64win:
        push    rbp
        push    rbx
        push    rdi                         # rdi and rsi are callee-saved on Windows
        push    rsi
        sub     rsp, 8                      # slot at [rsp] for the dpp pointer

        mov     rdi, rcx                    # move the Windows arguments into the
        mov     rsi, rdx                    # System V registers so the code below
        mov     rdx, r8                     # serves both ABIs
        mov     rcx, r9
        jmp     mentry                      # join the shared body

_unpack_decorr_mono_pass_cont_x64:
unpack_decorr_mono_pass_cont_x64:
        push    rbp
        push    rbx
        push    rdi
        push    rsi
        sub     rsp, 8                      # slot at [rsp] for the dpp pointer

mentry: mov     [rsp], rdi                  # store dpp* into [rsp]
        and     edx, edx                    # set flags and zero-extend edx into rdx (used as index below)
        jle     mono_done                   # OF is clear after AND, so this catches count <= 0

        cld                                 # the term 17 / 18 loops store with stosd
        mov     rdi, rsi                    # rdi = bptr
        lea     rsi, [rdi+rdx*4]            # rsi = eptr (4 bytes per mono sample)

        mov     rax, [rsp]                  # rax = dpp
        mov     eax, [rax]                  # eax = dpp->term; dispatch on its low byte
        cmp     al, 17
        je      mono_17_entry
        cmp     al, 18
        je      mono_18_entry               # any other term falls through to the code that follows
730
731#
732# registers during default term processing loop:
733#   rdi         active buffer pointer
734#   rsi         end of buffer pointer
735#   r8d         delta
736#   ecx         weight_A
737#   ebx         term * -4
738#   eax,edx     scratch
739#
740
default_mono_entry:
        imul    rbx, rax, -4                # rbx = term * -4 = byte offset back to the correlation sample
        mov     rdx, [rsp]                  # rdx = dpp
        mov     r8d, [rdx+4]                # r8d = delta
        mov     ecx, [rdx+8]                # ecx = weight_A
        jmp     default_mono_loop
747
748#
749# registers during processing loop for terms 17 & 18:
750#   rdi         active buffer pointer
751#   rsi         end of buffer pointer
752#   r8d         delta
753#   ecx         weight_A
754#   ebp         previously calculated value
755#   ebx         calculated correlation sample
756#   eax,edx     scratch
757#
758
mono_17_entry:
        mov     rdx, [rsp]                  # rdx = dpp
        mov     r8d, [rdx+4]                # r8d = delta
        mov     ecx, [rdx+8]                # ecx = weight_A
        mov     ebp, [rdi-4]                # ebp = bptr [-1], stands in for the last result
        jmp     mono_17_loop

mono_18_entry:
        mov     rdx, [rsp]                  # rdx = dpp
        mov     r8d, [rdx+4]                # r8d = delta
        mov     ecx, [rdx+8]                # ecx = weight_A
        mov     ebp, [rdi-4]                # ebp = bptr [-1], stands in for the last result
        jmp     mono_18_loop
772
773        .balign  64
774default_mono_loop:
775        mov     eax, [rdi+rbx]
776        imul    eax, ecx
777        mov     edx, [rdi]
778        jo      long_default_mono_loop
779        sar     eax, 10
780        adc     eax, edx
781        mov     [rdi], eax
782        mov     eax, [rdi+rbx]
783        add     rdi, 4
784        test    edx, edx
785        je      L100
786        test    eax, eax
787        je      L100
788        xor     eax, edx
789        cdq
790        xor     ecx, edx
791        add     ecx, r8d
792        xor     ecx, edx
793L100:   cmp     rdi, rsi                    # compare bptr and eptr to see if we're done
794        jb      default_mono_loop
795        jmp     default_mono_done
796
797        .balign  64
798long_default_mono_loop:
799        mov     eax, [rdi+rbx]
800        imul    ecx
801        shl     edx, 22
802        shr     eax, 10
803        adc     eax, edx
804        mov     edx, [rdi]
805        add     eax, edx
806        mov     [rdi], eax
807        mov     eax, [rdi+rbx]
808        add     rdi, 4
809        test    edx, edx
810        je      L101
811        test    eax, eax
812        je      L101
813        xor     eax, edx
814        cdq
815        xor     ecx, edx
816        add     ecx, r8d
817        xor     ecx, edx
818L101:   cmp     rdi, rsi                    # compare bptr and eptr to see if we're done
819        jb      long_default_mono_loop
820
821default_mono_done:
822        mov     rdx, [rsp]                  # edx = dpp*
823        mov     [rdx+8], ecx                # store weight_A back
824        mov     ecx, [rdx]                  # ecx = dpp->term
825
826default_mono_store_samples:
827        dec     ecx
828        sub     rdi, 4                      # back up one full sample
829        mov     eax, [rdi]
830        mov     [rdx+rcx*4+16], eax         # store samples_A [ecx]
831        test    ecx, ecx
832        jnz     default_mono_store_samples
833        jmp     mono_done
834
#
# Mono term 17: the correlation sample is 2 * bptr [-1] - bptr [-2]. ebp
# carries the most recent result so bptr [-1] is never reloaded. The fast
# loop uses a 32-bit multiply; the first overflow moves control to the
# 64-bit loop, which then runs to the end of the buffer.
#

        .balign  64
mono_17_loop:
        lea     ebx, [rbp+rbp]              # 64-bit addressing; only the low 32 bits are kept
        sub     ebx, [rdi-8]                # ebx = correlation sample = 2 * last - bptr [-2]
        mov     eax, ecx
        imul    eax, ebx                    # weight_A * sample, 32-bit product
        mov     edx, [rdi]                  # edx = value to decode (mov leaves OF from imul intact)
        jo      mono_17_long_loop           # overflow: redo this sample and the rest with 64-bit math
        sar     eax, 10                     # CF = bit 9 of the product
        adc     eax, edx                    # eax = edx + (product >> 10), rounded through CF
        stosd                               # store the result, bptr += 1
        test    ebx, ebx                    # flags consumed by the je below (mov keeps them)
        mov     ebp, eax                    # ebp = last result for the next iteration
        je      mono_17_check_end           # no weight update when either value is zero
        test    edx, edx
        je      mono_17_check_end
        xor     ebx, edx
        sar     ebx, 31                     # ebx = 0 when the signs match, -1 when they differ
        xor     ecx, ebx                    # complement / add / complement:
        add     ecx, r8d                    # weight_A += delta when the signs match,
        xor     ecx, ebx                    # weight_A -= delta when they differ
mono_17_check_end:
        cmp     rdi, rsi
        jb      mono_17_loop                # loop while bptr < eptr
        jmp     mono_1718_exit

        .balign  64
mono_17_long_loop:
        lea     ebx, [rbp+rbp]              # 64-bit addressing; only the low 32 bits are kept
        sub     ebx, [rdi-8]                # ebx = correlation sample = 2 * last - bptr [-2]
        mov     eax, ecx
        imul    ebx                         # edx:eax = weight_A * sample
        shl     edx, 22                     # high half supplies the top bits of product >> 10
        shr     eax, 10                     # low half >> 10, CF = bit 9
        adc     eax, edx                    # eax = low 32 bits of the rounded (product >> 10)
        mov     edx, [rdi]                  # edx = value to decode
        add     eax, edx
        stosd                               # store the result, bptr += 1
        test    ebx, ebx                    # flags consumed by the je below (mov keeps them)
        mov     ebp, eax                    # ebp = last result for the next iteration
        je      mono_17_long_check_end      # no weight update when either value is zero
        test    edx, edx
        je      mono_17_long_check_end
        xor     ebx, edx
        sar     ebx, 31                     # ebx = 0 when the signs match, -1 when they differ
        xor     ecx, ebx                    # complement / add / complement:
        add     ecx, r8d                    # weight_A += delta when the signs match,
        xor     ecx, ebx                    # weight_A -= delta when they differ
mono_17_long_check_end:
        cmp     rdi, rsi
        jb      mono_17_long_loop           # loop while bptr < eptr
        jmp     mono_1718_exit
885
#
# Mono term 18: the correlation sample is (3 * bptr [-1] - bptr [-2]) >> 1.
# ebp carries the most recent result so bptr [-1] is never reloaded. The
# fast loop uses a 32-bit multiply; the first overflow moves control to the
# 64-bit loop, which then runs to the end of the buffer.
#

        .balign  64
mono_18_loop:
        lea     ebx, [rbp+rbp*2]            # 64-bit addressing; only the low 32 bits are kept
        sub     ebx, [rdi-8]
        sar     ebx, 1                      # ebx = correlation sample = (3 * last - bptr [-2]) >> 1
        mov     eax, ecx
        imul    eax, ebx                    # weight_A * sample, 32-bit product
        mov     edx, [rdi]                  # edx = value to decode (mov leaves OF from imul intact)
        jo      mono_18_long_loop           # overflow: redo this sample and the rest with 64-bit math
        sar     eax, 10                     # CF = bit 9 of the product
        adc     eax, edx                    # eax = edx + (product >> 10), rounded through CF
        stosd                               # store the result, bptr += 1
        test    ebx, ebx                    # flags consumed by the je below (mov keeps them)
        mov     ebp, eax                    # ebp = last result for the next iteration
        je      mono_18_check_end           # no weight update when either value is zero
        test    edx, edx
        je      mono_18_check_end
        xor     ebx, edx
        sar     ebx, 31                     # ebx = 0 when the signs match, -1 when they differ
        xor     ecx, ebx                    # complement / add / complement:
        add     ecx, r8d                    # weight_A += delta when the signs match,
        xor     ecx, ebx                    # weight_A -= delta when they differ
mono_18_check_end:
        cmp     rdi, rsi
        jb      mono_18_loop                # loop while bptr < eptr
        jmp     mono_1718_exit

        .balign  64
mono_18_long_loop:
        lea     ebx, [rbp+rbp*2]            # 64-bit addressing; only the low 32 bits are kept
        sub     ebx, [rdi-8]
        sar     ebx, 1                      # ebx = correlation sample = (3 * last - bptr [-2]) >> 1
        mov     eax, ecx
        imul    ebx                         # edx:eax = weight_A * sample
        shl     edx, 22                     # high half supplies the top bits of product >> 10
        shr     eax, 10                     # low half >> 10, CF = bit 9
        adc     eax, edx                    # eax = low 32 bits of the rounded (product >> 10)
        mov     edx, [rdi]                  # edx = value to decode
        add     eax, edx
        stosd                               # store the result, bptr += 1
        test    ebx, ebx                    # flags consumed by the je below (mov keeps them)
        mov     ebp, eax                    # ebp = last result for the next iteration
        je      mono_18_long_check_end      # no weight update when either value is zero
        test    edx, edx
        je      mono_18_long_check_end
        xor     ebx, edx
        sar     ebx, 31                     # ebx = 0 when the signs match, -1 when they differ
        xor     ecx, ebx                    # complement / add / complement:
        add     ecx, r8d                    # weight_A += delta when the signs match,
        xor     ecx, ebx                    # weight_A -= delta when they differ
mono_18_long_check_end:
        cmp     rdi, rsi
        jb      mono_18_long_loop           # loop while bptr < eptr

mono_1718_exit:
        mov     rdx, [rsp]                  # rdx = dpp
        mov     [rdx+8], ecx                # write weight_A back
        mov     eax, [rdi-4]
        mov     [rdx+16], eax               # dpp->samples_A [0] = bptr [-1]
        mov     eax, [rdi-8]
        mov     [rdx+20], eax               # dpp->samples_A [1] = bptr [-2]

mono_done:
        add     rsp, 8                      # drop the dpp slot
        pop     rsi                         # restore in reverse push order
        pop     rdi
        pop     rbx
        pop     rbp
        ret
953
#if defined(__ELF__)
        .section .note.GNU-stack,"",@progbits
#endif

#if defined(__midipix__)
        .section .got$unpack_decorr_mono_pass_cont_x64win,"r"
        .globl  __imp_unpack_decorr_mono_pass_cont_x64win
__imp_unpack_decorr_mono_pass_cont_x64win:
        .quad   unpack_decorr_mono_pass_cont_x64win
        .linkonce discard

        .section .got$unpack_decorr_stereo_pass_cont_x64win,"r"
        .globl  __imp_unpack_decorr_stereo_pass_cont_x64win
__imp_unpack_decorr_stereo_pass_cont_x64win:
        .quad   unpack_decorr_stereo_pass_cont_x64win
        .linkonce discard
#endif
971