1;
2; jquantf.asm - sample data conversion and quantization (SSE & SSE2)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5; Copyright (C) 2016, D. R. Commander.
6;
7; Based on the x86 SIMD extension for IJG JPEG library
8; Copyright (C) 1999-2006, MIYASAKA Masaru.
9; For conditions of distribution and use, see copyright notice in jsimdext.inc
10;
11; This file should be assembled with NASM (Netwide Assembler),
12; can *not* be assembled with Microsoft's MASM or any compatible
13; assembler (including Borland's Turbo Assembler).
14; NASM is available from http://nasm.sourceforge.net/ or
15; http://sourceforge.net/project/showfiles.php?group_id=6208
16
17%include "jsimdext.inc"
18%include "jdct.inc"
19
20; --------------------------------------------------------------------------
21    SECTION     SEG_TEXT
22    BITS        32
;
; Load sample rows into the workspace as floats, applying the
; unsigned->signed conversion on the way.
;
; GLOBAL(void)
; jsimd_convsamp_float_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
;                           FAST_FLOAT *workspace);
;
; Arguments are read from the stack relative to ebp (see the defines below).
; Each loop pass handles two sample rows (8 bytes of samples per row) and the
; loop runs DCTSIZE/2 times.
;
; Saved/restored: ebp, ebx, esi, edi.
; Overwritten:    eax, ecx, edx (need not be preserved), xmm0-xmm3, xmm7, flags.
; The workspace is written with movaps, which requires 16-byte-aligned
; addresses.
;

%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
%define start_col    ebp + 12           ; JDIMENSION start_col
%define workspace    ebp + 16           ; FAST_FLOAT *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_convsamp_float_sse2)

EXTN(jsimd_convsamp_float_sse2):
    push        ebp
    mov         ebp, esp
    push        ebx
    push        esi
    push        edi
    ; ecx and edx are used below without being saved (need not be preserved)

    mov         edi, POINTER [workspace]       ; edi = output cursor (FAST_FLOAT *)
    mov         esi, JSAMPARRAY [sample_data]  ; esi = row-pointer cursor (JSAMPROW *)
    mov         eax, JDIMENSION [start_col]    ; eax = column offset within each row
    mov         ecx, DCTSIZE/2                 ; ecx = row pairs still to convert

    ; Build the per-byte bias without touching memory:
    ; all-ones words -> 0xFF80 words -> signed-saturating pack -> 0x80 bytes.
    pcmpeqw     xmm7, xmm7
    psllw       xmm7, 7
    packsswb    xmm7, xmm7              ; xmm7 = PB_CENTERJSAMPLE (0x808080..)

    alignx      16, 7
.rowpair:
    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; ebx = first row  (JSAMPLE *)
    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; edx = second row (JSAMPLE *)
    add         esi, byte 2*SIZEOF_JSAMPROW            ; step to the next two row pointers

    ; Both rows are fetched before anything is written to the workspace.
    movq        xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]  ; xmm0=(01234567)
    movq        xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]  ; xmm1=(89ABCDEF)

    ; ---- first row: samples 0-3 end up in xmm2, samples 4-7 in xmm0 ----
    ; ('*' in the lane diagrams marks bytes that are discarded by the shift)
    psubb       xmm0, xmm7                  ; remove the 0x80 bias from every byte
    punpcklbw   xmm0, xmm0                  ; xmm0=(*0*1*2*3*4*5*6*7)
    punpcklwd   xmm2, xmm0                  ; xmm2=(***0***1***2***3)
    punpckhwd   xmm0, xmm0                  ; xmm0=(***4***5***6***7)
    psrad       xmm2, (DWORD_BIT-BYTE_BIT)  ; sign-extend top byte -> xmm2=(0123)
    psrad       xmm0, (DWORD_BIT-BYTE_BIT)  ; sign-extend top byte -> xmm0=(4567)
    cvtdq2ps    xmm2, xmm2                  ; int32 -> float, xmm2=(0123)
    cvtdq2ps    xmm0, xmm0                  ; int32 -> float, xmm0=(4567)

    ; ---- second row: samples 8-B end up in xmm3, samples C-F in xmm1 ----
    psubb       xmm1, xmm7                  ; remove the 0x80 bias from every byte
    punpcklbw   xmm1, xmm1                  ; xmm1=(*8*9*A*B*C*D*E*F)
    punpcklwd   xmm3, xmm1                  ; xmm3=(***8***9***A***B)
    punpckhwd   xmm1, xmm1                  ; xmm1=(***C***D***E***F)
    psrad       xmm3, (DWORD_BIT-BYTE_BIT)  ; sign-extend top byte -> xmm3=(89AB)
    psrad       xmm1, (DWORD_BIT-BYTE_BIT)  ; sign-extend top byte -> xmm1=(CDEF)
    cvtdq2ps    xmm3, xmm3                  ; int32 -> float, xmm3=(89AB)
    cvtdq2ps    xmm1, xmm1                  ; int32 -> float, xmm1=(CDEF)

    ; ---- write both converted rows ----
    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2
    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1

    add         edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT  ; advance output by two rows
    dec         ecx
    jnz         short .rowpair

    pop         edi
    pop         esi
    pop         ebx
    pop         ebp
    ret
100
101; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; GLOBAL(void)
; jsimd_quantize_float_sse2(JCOEFPTR coef_block, FAST_FLOAT *divisors,
;                           FAST_FLOAT *workspace);
;
; Each workspace value is multiplied by the entry at the same position in
; divisors, converted to a 32-bit integer with cvtps2dq, then packed with
; signed saturation to a 16-bit word and stored to coef_block.
; NOTE(review): the table is multiplied in, so it presumably holds
; reciprocals of the quantization steps -- confirm against the code that
; fills it.
;
; Arguments are read from the stack relative to ebp (see the defines below).
; Each loop pass handles 16 values and the loop runs DCTSIZE2/16 times.
;
; Saved/restored: ebp, esi, edi.
; Overwritten:    eax, edx (need not be preserved), xmm0-xmm3, flags.
; ebx and ecx are not used.
; All three buffers are accessed with movaps/mulps/movdqa memory operands,
; which require 16-byte-aligned addresses.
;

%define coef_block  ebp + 8             ; JCOEFPTR coef_block
%define divisors    ebp + 12            ; FAST_FLOAT *divisors
%define workspace   ebp + 16            ; FAST_FLOAT *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_quantize_float_sse2)

EXTN(jsimd_quantize_float_sse2):
    push        ebp
    mov         ebp, esp
    push        esi
    push        edi

    mov         edi, JCOEFPTR [coef_block]  ; edi = output cursor
    mov         edx, POINTER [divisors]     ; edx = multiplier table cursor
    mov         esi, POINTER [workspace]    ; esi = input cursor
    mov         eax, DCTSIZE2/16            ; eax = 16-value groups still to do
    alignx      16, 7
.chunk:
    ; Scale four vectors of four floats; every read happens before any store.
    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
    movaps      xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
    mulps       xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
    movaps      xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
    mulps       xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]

    add         esi, byte 16*SIZEOF_FAST_FLOAT  ; inputs consumed: step both
    add         edx, byte 16*SIZEOF_FAST_FLOAT  ; float cursors by 16 values

    ; float -> int32 (cvtps2dq rounds according to the current MXCSR mode)
    cvtps2dq    xmm0, xmm0
    cvtps2dq    xmm1, xmm1
    cvtps2dq    xmm2, xmm2
    cvtps2dq    xmm3, xmm3

    ; int32 -> int16 with signed saturation, two source vectors per result
    packssdw    xmm0, xmm1
    packssdw    xmm2, xmm3

    movdqa      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0
    movdqa      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2

    add         edi, byte 16*SIZEOF_JCOEF       ; step output by 16 coefficients
    dec         eax
    jnz         short .chunk

    pop         edi
    pop         esi
    pop         ebp
    ret
165
166; For some reason, the OS X linker does not honor the request to align the
167; segment unless we do this.
168    align       32
169