;;
;; Copyright (c) 2020, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

%include "include/os.asm"
%include "imb_job.asm"
%include "include/memcpy.asm"
%include "include/clear_regs.asm"

section .data
default rel

align 16
constants0:
dd      0x61707865, 0x61707865, 0x61707865, 0x61707865

align 16
constants1:
dd      0x3320646e, 0x3320646e, 0x3320646e, 0x3320646e

align 16
constants2:
dd      0x79622d32, 0x79622d32, 0x79622d32, 0x79622d32

align 16
constants3:
dd      0x6b206574, 0x6b206574, 0x6b206574, 0x6b206574

align 16
constants:
dd      0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
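;; Note: these constant dwords are the ChaCha20 "sigma" constant, the ASCII
;; string "expand 32-byte k" as little-endian dwords: 0x61707865 = "expa",
;; 0x3320646e = "nd 3", 0x79622d32 = "2-by", 0x6b206574 = "te k"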

align 16
dword_1:
dd      0x00000001, 0x00000000, 0x00000000, 0x00000000

align 16
dword_2:
dd      0x00000002, 0x00000000, 0x00000000, 0x00000000

align 16
dword_1_4:
dd      0x00000001, 0x00000002, 0x00000003, 0x00000004

align 16
dword_4:
dd      0x00000004, 0x00000004, 0x00000004, 0x00000004

align 16
shuf_mask_rotl8:
db      3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14

align 16
shuf_mask_rotl16:
db      2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13

align 16
poly_clamp_r:
dq      0x0ffffffc0fffffff, 0x0ffffffc0ffffffc
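;; Poly1305 clamp mask from RFC 7539: "r" (first 16 key bytes) is ANDed with
;; 0x0ffffffc0ffffffc0ffffffc0fffffff, clearing the top 4 bits of bytes
;; 3, 7, 11 and 15 and the bottom 2 bits of bytes 4, 8 and 12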

struc STACK
_STATE:         reso    16      ; Space to store first 4 states
_XMM_SAVE:      reso    2       ; Space to store up to 2 temporary XMM registers
_RSP_SAVE:      resq    1       ; Space to store rsp pointer
endstruc
%define STACK_SIZE STACK_size

%ifdef LINUX
%define arg1    rdi
%define arg2    rsi
%else
%define arg1    rcx
%define arg2    rdx
%endif

%define job     arg1

%define APPEND(a,b) a %+ b

section .text

;; 4x4 32-bit transpose function
%macro TRANSPOSE4_U32 6
%define %%r0 %1 ;; [in/out] Input first row / output third column
%define %%r1 %2 ;; [in/out] Input second row / output second column
%define %%r2 %3 ;; [in/clobbered] Input third row
%define %%r3 %4 ;; [in/out] Input fourth row / output fourth column
%define %%t0 %5 ;; [out] Temporary XMM register / output first column
%define %%t1 %6 ;; [clobbered] Temporary XMM register

        movdqa  %%t0, %%r0
        shufps  %%t0, %%r1, 0x44        ; t0 = {b1 b0 a1 a0}
        shufps  %%r0, %%r1, 0xEE        ; r0 = {b3 b2 a3 a2}
        movdqa  %%t1, %%r2
        shufps  %%t1, %%r3, 0x44        ; t1 = {d1 d0 c1 c0}
        shufps  %%r2, %%r3, 0xEE        ; r2 = {d3 d2 c3 c2}

        movdqa  %%r1, %%t0
        shufps  %%r1, %%t1, 0xDD        ; r1 = {d1 c1 b1 a1}
        movdqa  %%r3, %%r0
        shufps  %%r3, %%r2, 0xDD        ; r3 = {d3 c3 b3 a3}
        shufps  %%r0, %%r2, 0x88        ; r0 = {d2 c2 b2 a2}
        shufps  %%t0, %%t1, 0x88        ; t0 = {d0 c0 b0 a0}
%endmacro
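
;; Worked example of the transpose above, with a/b/c/d denoting the four
;; input rows and dwords shown most-significant first:
;;   in : r0 = {a3 a2 a1 a0}   r1 = {b3 b2 b1 b0}
;;        r2 = {c3 c2 c1 c0}   r3 = {d3 d2 d1 d0}
;;   out: t0 = {d0 c0 b0 a0}   r1 = {d1 c1 b1 a1}
;;        r0 = {d2 c2 b2 a2}   r3 = {d3 c3 b3 a3}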

; Rotate dwords in an XMM register to the left by N_BITS
%macro PROLD 3
%define %%XMM_IN %1 ; [in/out] XMM register to be rotated
%define %%N_BITS %2 ; [immediate] Number of bits to rotate
%define %%XTMP   %3 ; [clobbered] XMM temporary register
%if %%N_BITS == 8
        pshufb  %%XMM_IN, [rel shuf_mask_rotl8]
%elif %%N_BITS == 16
        pshufb  %%XMM_IN, [rel shuf_mask_rotl16]
%else
        movdqa  %%XTMP, %%XMM_IN
        psrld   %%XTMP, (32-%%N_BITS)
        pslld   %%XMM_IN, %%N_BITS
        por     %%XMM_IN, %%XTMP
%endif
%endmacro
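
;; PROLD implements the rotation identity (x <<< n) == (x << n) | (x >> (32 - n)).
;; For n == 8 and n == 16 the rotation is a whole-byte permutation, so a single
;; pshufb with the masks above replaces the shift/shift/or sequence and leaves
;; %%XTMP untouched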

;;
;; Performs a quarter round on all 4 columns,
;; resulting in a full round
;;
%macro quarter_round 5
%define %%A    %1 ;; [in/out] XMM register containing value A of all 4 columns
%define %%B    %2 ;; [in/out] XMM register containing value B of all 4 columns
%define %%C    %3 ;; [in/out] XMM register containing value C of all 4 columns
%define %%D    %4 ;; [in/out] XMM register containing value D of all 4 columns
%define %%XTMP %5 ;; [clobbered] Temporary XMM register

        paddd   %%A, %%B
        pxor    %%D, %%A
        PROLD   %%D, 16, %%XTMP
        paddd   %%C, %%D
        pxor    %%B, %%C
        PROLD   %%B, 12, %%XTMP
        paddd   %%A, %%B
        pxor    %%D, %%A
        PROLD   %%D, 8, %%XTMP
        paddd   %%C, %%D
        pxor    %%B, %%C
        PROLD   %%B, 7, %%XTMP

%endmacro
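
;; For reference: this is the RFC 7539 quarter round applied lane-wise,
;; one column per dword lane, so all 4 columns advance at once:
;;   a += b; d ^= a; d <<<= 16
;;   c += d; b ^= c; b <<<= 12
;;   a += b; d ^= a; d <<<= 8
;;   c += d; b ^= c; b <<<= 7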

%macro quarter_round_x2 9
%define %%A_L    %1 ;; [in/out] XMM register containing value A of all 4 columns (low block count)
%define %%B_L    %2 ;; [in/out] XMM register containing value B of all 4 columns (low block count)
%define %%C_L    %3 ;; [in/out] XMM register containing value C of all 4 columns (low block count)
%define %%D_L    %4 ;; [in/out] XMM register containing value D of all 4 columns (low block count)
%define %%A_H    %5 ;; [in/out] XMM register containing value A of all 4 columns (high block count)
%define %%B_H    %6 ;; [in/out] XMM register containing value B of all 4 columns (high block count)
%define %%C_H    %7 ;; [in/out] XMM register containing value C of all 4 columns (high block count)
%define %%D_H    %8 ;; [in/out] XMM register containing value D of all 4 columns (high block count)
%define %%XTMP   %9 ;; [clobbered] Temporary XMM register

        paddd   %%A_L, %%B_L
        paddd   %%A_H, %%B_H
        pxor    %%D_L, %%A_L
        pxor    %%D_H, %%A_H
        PROLD   %%D_L, 16, %%XTMP
        PROLD   %%D_H, 16, %%XTMP
        paddd   %%C_L, %%D_L
        paddd   %%C_H, %%D_H
        pxor    %%B_L, %%C_L
        pxor    %%B_H, %%C_H
        PROLD   %%B_L, 12, %%XTMP
        PROLD   %%B_H, 12, %%XTMP
        paddd   %%A_L, %%B_L
        paddd   %%A_H, %%B_H
        pxor    %%D_L, %%A_L
        pxor    %%D_H, %%A_H
        PROLD   %%D_L, 8, %%XTMP
        PROLD   %%D_H, 8, %%XTMP
        paddd   %%C_L, %%D_L
        paddd   %%C_H, %%D_H
        pxor    %%B_L, %%C_L
        pxor    %%B_H, %%C_H
        PROLD   %%B_L, 7, %%XTMP
        PROLD   %%B_H, 7, %%XTMP

%endmacro

;;
;; Rotates the registers to prepare the data
;; from column round to diagonal round
;;
%macro column_to_diag 3
%define %%B %1 ;; [in/out] XMM register containing value B of all 4 columns
%define %%C %2 ;; [in/out] XMM register containing value C of all 4 columns
%define %%D %3 ;; [in/out] XMM register containing value D of all 4 columns

        pshufd  %%B, %%B, 0x39 ; 0b00111001 ;; 0,3,2,1
        pshufd  %%C, %%C, 0x4E ; 0b01001110 ;; 1,0,3,2
        pshufd  %%D, %%D, 0x93 ; 0b10010011 ;; 2,1,0,3

%endmacro
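
;; Viewing the state as a 4x4 dword matrix (A = dwords 0-3, B = 4-7, C = 8-11,
;; D = 12-15), rotating B left by one dword, C by two and D by three lines up
;; the diagonals (0,5,10,15), (1,6,11,12), (2,7,8,13) and (3,4,9,14) as
;; columns, so the same quarter round code performs the diagonal round;
;; diag_to_column below applies the inverse rotations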

;;
;; Rotates the registers to prepare the data
;; from diagonal round to column round
;;
%macro diag_to_column 3
%define %%B %1 ;; [in/out] XMM register containing value B of all 4 columns
%define %%C %2 ;; [in/out] XMM register containing value C of all 4 columns
%define %%D %3 ;; [in/out] XMM register containing value D of all 4 columns

        pshufd  %%B, %%B, 0x93 ; 0b10010011 ; 2,1,0,3
        pshufd  %%C, %%C, 0x4E ; 0b01001110 ; 1,0,3,2
        pshufd  %%D, %%D, 0x39 ; 0b00111001 ; 0,3,2,1

%endmacro

;;
;; Generates 64 or 128 bytes of keystream
;; Input states A-C are the same for the first and last 64 bytes;
;; input state D differs because of the different block counts
;;
%macro GENERATE_64_128_KS 9-14
%define %%STATE_IN_A      %1  ;; [in] XMM containing state A
%define %%STATE_IN_B      %2  ;; [in] XMM containing state B
%define %%STATE_IN_C      %3  ;; [in] XMM containing state C
%define %%STATE_IN_D_L    %4  ;; [in] XMM containing state D (low block count)
%define %%A_L_KS0         %5  ;; [out] XMM to contain keystream 0-15 bytes
%define %%B_L_KS1         %6  ;; [out] XMM to contain keystream 16-31 bytes
%define %%C_L_KS2         %7  ;; [out] XMM to contain keystream 32-47 bytes
%define %%D_L_KS3         %8  ;; [out] XMM to contain keystream 48-63 bytes
%define %%XTMP            %9  ;; [clobbered] Temporary XMM register
%define %%STATE_IN_D_H    %10 ;; [in] XMM containing state D (high block count)
%define %%A_H_KS4         %11 ;; [out] XMM to contain keystream 64-79 bytes
%define %%B_H_KS5         %12 ;; [out] XMM to contain keystream 80-95 bytes
%define %%C_H_KS6         %13 ;; [out] XMM to contain keystream 96-111 bytes
%define %%D_H_KS7         %14 ;; [out] XMM to contain keystream 112-127 bytes

        movdqa  %%A_L_KS0, %%STATE_IN_A
        movdqa  %%B_L_KS1, %%STATE_IN_B
        movdqa  %%C_L_KS2, %%STATE_IN_C
        movdqa  %%D_L_KS3, %%STATE_IN_D_L
%if %0 == 14
        movdqa  %%A_H_KS4, %%STATE_IN_A
        movdqa  %%B_H_KS5, %%STATE_IN_B
        movdqa  %%C_H_KS6, %%STATE_IN_C
        movdqa  %%D_H_KS7, %%STATE_IN_D_H
%endif
%rep 10
%if %0 == 14
        quarter_round_x2 %%A_L_KS0, %%B_L_KS1, %%C_L_KS2, %%D_L_KS3, \
                %%A_H_KS4, %%B_H_KS5, %%C_H_KS6, %%D_H_KS7, %%XTMP
        column_to_diag %%B_L_KS1, %%C_L_KS2, %%D_L_KS3
        column_to_diag %%B_H_KS5, %%C_H_KS6, %%D_H_KS7
        quarter_round_x2 %%A_L_KS0, %%B_L_KS1, %%C_L_KS2, %%D_L_KS3, \
                %%A_H_KS4, %%B_H_KS5, %%C_H_KS6, %%D_H_KS7, %%XTMP
        diag_to_column %%B_L_KS1, %%C_L_KS2, %%D_L_KS3
        diag_to_column %%B_H_KS5, %%C_H_KS6, %%D_H_KS7
%else
        quarter_round %%A_L_KS0, %%B_L_KS1, %%C_L_KS2, %%D_L_KS3, %%XTMP
        column_to_diag %%B_L_KS1, %%C_L_KS2, %%D_L_KS3
        quarter_round %%A_L_KS0, %%B_L_KS1, %%C_L_KS2, %%D_L_KS3, %%XTMP
        diag_to_column %%B_L_KS1, %%C_L_KS2, %%D_L_KS3
%endif
%endrep

        paddd   %%A_L_KS0, %%STATE_IN_A
        paddd   %%B_L_KS1, %%STATE_IN_B
        paddd   %%C_L_KS2, %%STATE_IN_C
        paddd   %%D_L_KS3, %%STATE_IN_D_L
%if %0 == 14
        paddd   %%A_H_KS4, %%STATE_IN_A
        paddd   %%B_H_KS5, %%STATE_IN_B
        paddd   %%C_H_KS6, %%STATE_IN_C
        paddd   %%D_H_KS7, %%STATE_IN_D_H
%endif
%endmacro
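
;; Note: the %rep 10 loop above runs 10 double rounds (column round plus
;; diagonal round), i.e. the 20 ChaCha20 rounds of RFC 7539, and the final
;; paddd adds the input state back in to form the keystream block(s)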

; Perform the operation in the first parameter 4 times
%macro XMM_OP_X4 9
%define %%OP         %1 ; [immediate] Instruction
%define %%DST_SRC1_1 %2 ; [in/out] First source/Destination 1
%define %%DST_SRC1_2 %3 ; [in/out] First source/Destination 2
%define %%DST_SRC1_3 %4 ; [in/out] First source/Destination 3
%define %%DST_SRC1_4 %5 ; [in/out] First source/Destination 4
%define %%SRC2_1     %6 ; [in] Second source 1
%define %%SRC2_2     %7 ; [in] Second source 2
%define %%SRC2_3     %8 ; [in] Second source 3
%define %%SRC2_4     %9 ; [in] Second source 4

        %%OP %%DST_SRC1_1, %%SRC2_1
        %%OP %%DST_SRC1_2, %%SRC2_2
        %%OP %%DST_SRC1_3, %%SRC2_3
        %%OP %%DST_SRC1_4, %%SRC2_4
%endmacro

%macro XMM_ROLS_X4  6
%define %%XMM_OP1_1      %1
%define %%XMM_OP1_2      %2
%define %%XMM_OP1_3      %3
%define %%XMM_OP1_4      %4
%define %%BITS_TO_ROTATE %5
%define %%XTMP           %6

        ; Save the temporary register when the rotate amount is not 8 or 16,
        ; as PROLD clobbers it in those cases and it may hold data
        ; still needed by the caller
%if %%BITS_TO_ROTATE != 8 && %%BITS_TO_ROTATE != 16
        movdqa  [rsp + _XMM_SAVE], %%XTMP
%endif
        PROLD   %%XMM_OP1_1, %%BITS_TO_ROTATE, %%XTMP
        PROLD   %%XMM_OP1_2, %%BITS_TO_ROTATE, %%XTMP
        PROLD   %%XMM_OP1_3, %%BITS_TO_ROTATE, %%XTMP
        PROLD   %%XMM_OP1_4, %%BITS_TO_ROTATE, %%XTMP
%if %%BITS_TO_ROTATE != 8 && %%BITS_TO_ROTATE != 16
        movdqa  %%XTMP, [rsp + _XMM_SAVE]
%endif
%endmacro
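
;; Note: CHACHA20_ROUND below passes a live state register as %%XTMP: B1 for
;; the 16- and 8-bit D rotations (pshufb based, %%XTMP not touched) and D1 for
;; the 12- and 7-bit B rotations (where the spill to _XMM_SAVE preserves it).
;; All 16 XMM registers hold state, so no free temporary register exists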

;;
;; Performs a full chacha20 round on 4 states,
;; consisting of 4 quarter rounds, which are done in parallel
;;
%macro CHACHA20_ROUND 16
%define %%XMM_DWORD_A1  %1  ;; [in/out] XMM register containing dword A for first quarter round
%define %%XMM_DWORD_A2  %2  ;; [in/out] XMM register containing dword A for second quarter round
%define %%XMM_DWORD_A3  %3  ;; [in/out] XMM register containing dword A for third quarter round
%define %%XMM_DWORD_A4  %4  ;; [in/out] XMM register containing dword A for fourth quarter round
%define %%XMM_DWORD_B1  %5  ;; [in/out] XMM register containing dword B for first quarter round
%define %%XMM_DWORD_B2  %6  ;; [in/out] XMM register containing dword B for second quarter round
%define %%XMM_DWORD_B3  %7  ;; [in/out] XMM register containing dword B for third quarter round
%define %%XMM_DWORD_B4  %8  ;; [in/out] XMM register containing dword B for fourth quarter round
%define %%XMM_DWORD_C1  %9  ;; [in/out] XMM register containing dword C for first quarter round
%define %%XMM_DWORD_C2 %10  ;; [in/out] XMM register containing dword C for second quarter round
%define %%XMM_DWORD_C3 %11  ;; [in/out] XMM register containing dword C for third quarter round
%define %%XMM_DWORD_C4 %12  ;; [in/out] XMM register containing dword C for fourth quarter round
%define %%XMM_DWORD_D1 %13  ;; [in/out] XMM register containing dword D for first quarter round
%define %%XMM_DWORD_D2 %14  ;; [in/out] XMM register containing dword D for second quarter round
%define %%XMM_DWORD_D3 %15  ;; [in/out] XMM register containing dword D for third quarter round
%define %%XMM_DWORD_D4 %16  ;; [in/out] XMM register containing dword D for fourth quarter round

        ; A += B
        XMM_OP_X4 paddd, %%XMM_DWORD_A1, %%XMM_DWORD_A2, %%XMM_DWORD_A3, %%XMM_DWORD_A4, \
                         %%XMM_DWORD_B1, %%XMM_DWORD_B2, %%XMM_DWORD_B3, %%XMM_DWORD_B4
        ; D ^= A
        XMM_OP_X4 pxor, %%XMM_DWORD_D1, %%XMM_DWORD_D2, %%XMM_DWORD_D3, %%XMM_DWORD_D4, \
                        %%XMM_DWORD_A1, %%XMM_DWORD_A2, %%XMM_DWORD_A3, %%XMM_DWORD_A4

        ; D <<< 16
        XMM_ROLS_X4 %%XMM_DWORD_D1, %%XMM_DWORD_D2, %%XMM_DWORD_D3, %%XMM_DWORD_D4, 16, \
                    %%XMM_DWORD_B1

        ; C += D
        XMM_OP_X4 paddd, %%XMM_DWORD_C1, %%XMM_DWORD_C2, %%XMM_DWORD_C3, %%XMM_DWORD_C4, \
                         %%XMM_DWORD_D1, %%XMM_DWORD_D2, %%XMM_DWORD_D3, %%XMM_DWORD_D4
        ; B ^= C
        XMM_OP_X4 pxor, %%XMM_DWORD_B1, %%XMM_DWORD_B2, %%XMM_DWORD_B3, %%XMM_DWORD_B4, \
                        %%XMM_DWORD_C1, %%XMM_DWORD_C2, %%XMM_DWORD_C3, %%XMM_DWORD_C4

        ; B <<< 12
        XMM_ROLS_X4 %%XMM_DWORD_B1, %%XMM_DWORD_B2, %%XMM_DWORD_B3, %%XMM_DWORD_B4, 12, \
                    %%XMM_DWORD_D1

        ; A += B
        XMM_OP_X4 paddd, %%XMM_DWORD_A1, %%XMM_DWORD_A2, %%XMM_DWORD_A3, %%XMM_DWORD_A4, \
                         %%XMM_DWORD_B1, %%XMM_DWORD_B2, %%XMM_DWORD_B3, %%XMM_DWORD_B4
        ; D ^= A
        XMM_OP_X4 pxor, %%XMM_DWORD_D1, %%XMM_DWORD_D2, %%XMM_DWORD_D3, %%XMM_DWORD_D4, \
                        %%XMM_DWORD_A1, %%XMM_DWORD_A2, %%XMM_DWORD_A3, %%XMM_DWORD_A4

        ; D <<< 8
        XMM_ROLS_X4 %%XMM_DWORD_D1, %%XMM_DWORD_D2, %%XMM_DWORD_D3, %%XMM_DWORD_D4, 8, \
                    %%XMM_DWORD_B1

        ; C += D
        XMM_OP_X4 paddd, %%XMM_DWORD_C1, %%XMM_DWORD_C2, %%XMM_DWORD_C3, %%XMM_DWORD_C4, \
                         %%XMM_DWORD_D1, %%XMM_DWORD_D2, %%XMM_DWORD_D3, %%XMM_DWORD_D4
        ; B ^= C
        XMM_OP_X4 pxor, %%XMM_DWORD_B1, %%XMM_DWORD_B2, %%XMM_DWORD_B3, %%XMM_DWORD_B4, \
                        %%XMM_DWORD_C1, %%XMM_DWORD_C2, %%XMM_DWORD_C3, %%XMM_DWORD_C4

        ; B <<< 7
        XMM_ROLS_X4 %%XMM_DWORD_B1, %%XMM_DWORD_B2, %%XMM_DWORD_B3, %%XMM_DWORD_B4, 7, \
                    %%XMM_DWORD_D1
%endmacro

;;
;; Runs the ChaCha20 block function on 4 states, outputting 256 bytes of keystream
;; Data still needs to be transposed to get the keystream in the correct order
;;
%macro GENERATE_256_KS 16
%define %%XMM_DWORD_0   %1  ;; [out] XMM register to contain encoded dword 0 of the 4 Chacha20 states
%define %%XMM_DWORD_1   %2  ;; [out] XMM register to contain encoded dword 1 of the 4 Chacha20 states
%define %%XMM_DWORD_2   %3  ;; [out] XMM register to contain encoded dword 2 of the 4 Chacha20 states
%define %%XMM_DWORD_3   %4  ;; [out] XMM register to contain encoded dword 3 of the 4 Chacha20 states
%define %%XMM_DWORD_4   %5  ;; [out] XMM register to contain encoded dword 4 of the 4 Chacha20 states
%define %%XMM_DWORD_5   %6  ;; [out] XMM register to contain encoded dword 5 of the 4 Chacha20 states
%define %%XMM_DWORD_6   %7  ;; [out] XMM register to contain encoded dword 6 of the 4 Chacha20 states
%define %%XMM_DWORD_7   %8  ;; [out] XMM register to contain encoded dword 7 of the 4 Chacha20 states
%define %%XMM_DWORD_8   %9  ;; [out] XMM register to contain encoded dword 8 of the 4 Chacha20 states
%define %%XMM_DWORD_9  %10  ;; [out] XMM register to contain encoded dword 9 of the 4 Chacha20 states
%define %%XMM_DWORD_10 %11  ;; [out] XMM register to contain encoded dword 10 of the 4 Chacha20 states
%define %%XMM_DWORD_11 %12  ;; [out] XMM register to contain encoded dword 11 of the 4 Chacha20 states
%define %%XMM_DWORD_12 %13  ;; [out] XMM register to contain encoded dword 12 of the 4 Chacha20 states
%define %%XMM_DWORD_13 %14  ;; [out] XMM register to contain encoded dword 13 of the 4 Chacha20 states
%define %%XMM_DWORD_14 %15  ;; [out] XMM register to contain encoded dword 14 of the 4 Chacha20 states
%define %%XMM_DWORD_15 %16  ;; [out] XMM register to contain encoded dword 15 of the 4 Chacha20 states

%assign i 0
%rep 16
        movdqa  APPEND(%%XMM_DWORD_, i), [rsp + _STATE + 16*i]
%assign i (i + 1)
%endrep

%rep 10
        CHACHA20_ROUND %%XMM_DWORD_0, %%XMM_DWORD_1, %%XMM_DWORD_2, %%XMM_DWORD_3, \
                       %%XMM_DWORD_4, %%XMM_DWORD_5, %%XMM_DWORD_6, %%XMM_DWORD_7, \
                       %%XMM_DWORD_8, %%XMM_DWORD_9, %%XMM_DWORD_10, %%XMM_DWORD_11, \
                       %%XMM_DWORD_12, %%XMM_DWORD_13, %%XMM_DWORD_14, %%XMM_DWORD_15

        CHACHA20_ROUND %%XMM_DWORD_0, %%XMM_DWORD_1, %%XMM_DWORD_2, %%XMM_DWORD_3, \
                       %%XMM_DWORD_5, %%XMM_DWORD_6, %%XMM_DWORD_7, %%XMM_DWORD_4, \
                       %%XMM_DWORD_10, %%XMM_DWORD_11, %%XMM_DWORD_8, %%XMM_DWORD_9, \
                       %%XMM_DWORD_15, %%XMM_DWORD_12, %%XMM_DWORD_13, %%XMM_DWORD_14
%endrep

%assign i 0
%rep 16
        paddd   APPEND(%%XMM_DWORD_, i), [rsp + _STATE + 16*i]
%assign i (i + 1)
%endrep
%endmacro
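
;; Note: the second CHACHA20_ROUND call above rotates the B, C and D argument
;; groups (B: 5,6,7,4 - C: 10,11,8,9 - D: 15,12,13,14). Since each XMM register
;; holds one state dword across the 4 parallel states, this register renaming
;; selects the diagonals (0,5,10,15), (1,6,11,12), (2,7,8,13) and (3,4,9,14),
;; performing the diagonal round without any shuffle instructions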

align 32
MKGLOBAL(submit_job_chacha20_enc_dec_sse,function,internal)
submit_job_chacha20_enc_dec_sse:

%define src     r8
%define dst     r9
%define len     r10
%define iv      r11
%define keys    rdx
%define off     rax
%define tmp     iv
%define tmp2    keys

        ; Read pointers and length
        mov     len, [job + _msg_len_to_cipher_in_bytes]

        ; Check if there is nothing to encrypt
        or      len, len
        jz      exit

        mov     keys, [job + _enc_keys]
        mov     iv, [job + _iv]
        mov     src, [job + _src]
        add     src, [job + _cipher_start_src_offset_in_bytes]
        mov     dst, [job + _dst]

        mov     rax, rsp
        sub     rsp, STACK_SIZE
        and     rsp, -16
        mov     [rsp + _RSP_SAVE], rax ; save RSP

        xor     off, off

        ; If 64*2 bytes or less, prepare states directly
        ; for up to 2 blocks
        cmp     len, 64*2
        jbe     check_1_or_2_blocks_left

        ; Prepare first 4 chacha states
        movdqa  xmm0, [rel constants0]
        movdqa  xmm1, [rel constants1]
        movdqa  xmm2, [rel constants2]
        movdqa  xmm3, [rel constants3]

        ; Broadcast 8 dwords from key into XMM4-11
        movdqu  xmm12, [keys]
        movdqu  xmm15, [keys + 16]
        pshufd  xmm4, xmm12, 0x0
        pshufd  xmm5, xmm12, 0x55
        pshufd  xmm6, xmm12, 0xAA
        pshufd  xmm7, xmm12, 0xFF
        pshufd  xmm8, xmm15, 0x0
        pshufd  xmm9, xmm15, 0x55
        pshufd  xmm10, xmm15, 0xAA
        pshufd  xmm11, xmm15, 0xFF

        ; Broadcast 3 dwords from IV into XMM13-15
        movd    xmm13, [iv]
        movd    xmm14, [iv + 4]
        pshufd  xmm13, xmm13, 0
        pshufd  xmm14, xmm14, 0
        movd    xmm15, [iv + 8]
        pshufd  xmm15, xmm15, 0

        ; Set block counters for first 4 Chacha20 states
        movdqa  xmm12, [rel dword_1_4]

%assign i 0
%rep 16
        movdqa  [rsp + _STATE + 16*i], xmm %+ i
%assign i (i + 1)
%endrep
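
        ;; xmm0-xmm15 now hold dwords 0-15 of four parallel ChaCha20 states
        ;; (register i = dword i of all 4 states): constants in xmm0-3, key in
        ;; xmm4-11, block counters 1-4 in xmm12, nonce in xmm13-15. The copy
        ;; saved on the stack is the input state that GENERATE_256_KS reloads
        ;; and adds back after the rounds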

        cmp     len, 64*4
        jb      exit_loop

align 32
start_loop:

        ; Generate 256 bytes of keystream
        GENERATE_256_KS xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, \
                        xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15

        ;; Transpose state to get keystream and XOR with plaintext
        ;; to get ciphertext

        ; Save registers to be used as temp registers
        movdqa  [rsp + _XMM_SAVE], xmm14
        movdqa  [rsp + _XMM_SAVE + 16], xmm15

        ; Transpose to get bytes 0-15 of KS for each of the 4 blocks
        TRANSPOSE4_U32 xmm0, xmm1, xmm2, xmm3, xmm14, xmm15

        ; Bytes 0-15 of blocks 0-3 in xmm14, xmm1, xmm0, xmm3
        ; xmm2, xmm15 free to use
        movdqu  xmm2, [src + off]
        movdqu  xmm15, [src + off + 16*4]
        pxor    xmm14, xmm2
        pxor    xmm1, xmm15
        movdqu  [dst + off], xmm14
        movdqu  [dst + off + 16*4], xmm1

        movdqu  xmm2, [src + off + 16*8]
        movdqu  xmm15, [src + off + 16*12]
        pxor    xmm0, xmm2
        pxor    xmm3, xmm15
        movdqu  [dst + off + 16*8], xmm0
        movdqu  [dst + off + 16*12], xmm3

        ; Restore registers and use xmm0, xmm1 now that they are free
        movdqa  xmm14, [rsp + _XMM_SAVE]
        movdqa  xmm15, [rsp + _XMM_SAVE + 16]

        ; Transpose to get bytes 16-31 of KS for each of the 4 blocks
        TRANSPOSE4_U32 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1

        ; Bytes 16-31 of blocks 0-3 in xmm0, xmm5, xmm4, xmm7
        ; xmm6, xmm1 free to use
        movdqu  xmm6, [src + off + 16]
        movdqu  xmm1, [src + off + 16*5]
        pxor    xmm0, xmm6
        pxor    xmm5, xmm1
        movdqu  [dst + off + 16], xmm0
        movdqu  [dst + off + 16*5], xmm5

        movdqu  xmm6, [src + off + 16*9]
        movdqu  xmm1, [src + off + 16*13]
        pxor    xmm4, xmm6
        pxor    xmm7, xmm1
        movdqu  [dst + off + 16*9], xmm4
        movdqu  [dst + off + 16*13], xmm7

        ; Transpose to get bytes 32-47 of KS for each of the 4 blocks
        TRANSPOSE4_U32 xmm8, xmm9, xmm10, xmm11, xmm0, xmm1

        ; Bytes 32-47 of blocks 0-3 in xmm0, xmm9, xmm8, xmm11
        ; xmm10, xmm1 free to use
        movdqu  xmm10, [src + off + 16*2]
        movdqu  xmm1, [src + off + 16*6]
        pxor    xmm0, xmm10
        pxor    xmm9, xmm1
        movdqu  [dst + off + 16*2], xmm0
        movdqu  [dst + off + 16*6], xmm9

        movdqu  xmm10, [src + off + 16*10]
        movdqu  xmm1, [src + off + 16*14]
        pxor    xmm8, xmm10
        pxor    xmm11, xmm1
        movdqu  [dst + off + 16*10], xmm8
        movdqu  [dst + off + 16*14], xmm11

        ; Transpose to get bytes 48-63 of KS for each of the 4 blocks
        TRANSPOSE4_U32 xmm12, xmm13, xmm14, xmm15, xmm0, xmm1

        ; Bytes 48-63 of blocks 0-3 in xmm0, xmm13, xmm12, xmm15
        ; xmm14, xmm1 free to use
        movdqu  xmm14, [src + off + 16*3]
        movdqu  xmm1, [src + off + 16*7]
        pxor    xmm0, xmm14
        pxor    xmm13, xmm1
        movdqu  [dst + off + 16*3], xmm0
        movdqu  [dst + off + 16*7], xmm13

        movdqu  xmm14, [src + off + 16*11]
        movdqu  xmm1, [src + off + 16*15]
        pxor    xmm12, xmm14
        pxor    xmm15, xmm1
        movdqu  [dst + off + 16*11], xmm12
        movdqu  [dst + off + 16*15], xmm15

        ; Update remaining length
        sub     len, 64*4
        add     off, 64*4

        ; Update counter values
        movdqa  xmm12, [rsp + _STATE + 16*12]
        paddd   xmm12, [rel dword_4]
        movdqa  [rsp + _STATE + 16*12], xmm12

        cmp     len, 64*4
        jae     start_loop

exit_loop:

        ; Check if there are no more bytes to encrypt
        or      len, len
        jz      no_partial_block

        cmp     len, 64*2
        ja      more_than_2_blocks_left

check_1_or_2_blocks_left:
        cmp     len, 64
        ja      two_blocks_left

        ;; 1 block left

        ; Get last block counter by dividing offset by 64
        shr     off, 6

        ; Prepare next chacha state from IV and key
        movdqu  xmm1, [keys]          ; Load key bytes 0-15
        movdqu  xmm2, [keys + 16]     ; Load key bytes 16-31
        ; Read nonce (12 bytes)
        movq    xmm3, [iv]
        pinsrd  xmm3, [iv + 8], 2
        pslldq  xmm3, 4
        movdqa  xmm0, [rel constants]

        ; Insert next block count
        inc     DWORD(off)
        movd    xmm4, DWORD(off)
        por     xmm3, xmm4
        dec     DWORD(off)
        shl     off, 6 ; Restore offset
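
        ;; xmm3 = {nonce2 nonce1 nonce0 counter}: movq/pinsrd loaded the
        ;; 12-byte IV into dwords 0-2, pslldq moved it up to dwords 1-3,
        ;; and the incremented block counter was OR'd into dword 0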

        ; Generate 64 bytes of keystream
        GENERATE_64_128_KS xmm0, xmm1, xmm2, xmm3, xmm9, xmm10, xmm11, \
                           xmm12, xmm13

        cmp     len, 64
        jne     less_than_64

        ;; Exactly 64 bytes left

        ; Load plaintext, XOR with KS and store ciphertext
        movdqu  xmm14, [src + off]
        movdqu  xmm15, [src + off + 16]
        pxor    xmm14, xmm9
        pxor    xmm15, xmm10
        movdqu  [dst + off], xmm14
        movdqu  [dst + off + 16], xmm15

        movdqu  xmm14, [src + off + 16*2]
        movdqu  xmm15, [src + off + 16*3]
        pxor    xmm14, xmm11
        pxor    xmm15, xmm12
        movdqu  [dst + off + 16*2], xmm14
        movdqu  [dst + off + 16*3], xmm15

        jmp     no_partial_block

less_than_64:

        cmp     len, 48
        jb      less_than_48

        ; Load plaintext and XOR with keystream
        movdqu  xmm13, [src + off]
        movdqu  xmm14, [src + off + 16]
        movdqu  xmm15, [src + off + 32]

        pxor    xmm13, xmm9
        pxor    xmm14, xmm10
        pxor    xmm15, xmm11

        ; Store resulting ciphertext
        movdqu  [dst + off], xmm13
        movdqu  [dst + off + 16], xmm14
        movdqu  [dst + off + 32], xmm15

        ; Store last KS in xmm9, for partial block
        movdqu  xmm9, xmm12

        sub     len, 48
        add     off, 48

        jmp     check_partial

less_than_48:
        cmp     len, 32
        jb      less_than_32

        ; Load plaintext and XOR with keystream
        movdqu  xmm13, [src + off]
        movdqu  xmm14, [src + off + 16]

        pxor    xmm13, xmm9
        pxor    xmm14, xmm10

        ; Store resulting ciphertext
        movdqu  [dst + off], xmm13
        movdqu  [dst + off + 16], xmm14

        ; Store last KS in xmm9, for partial block
        movdqu  xmm9, xmm11

        sub     len, 32
        add     off, 32

        jmp     check_partial

less_than_32:
        cmp     len, 16
        jb      check_partial

        ; Load plaintext and XOR with keystream
        movdqu  xmm13, [src + off]

        pxor    xmm13, xmm9

        ; Store resulting ciphertext
        movdqu  [dst + off], xmm13

        ; Store last KS in xmm9, for partial block
        movdqu  xmm9, xmm10

        sub     len, 16
        add     off, 16

check_partial:
        or      len, len
        jz      no_partial_block

        add     src, off
        add     dst, off
        ; Load plaintext
        simd_load_sse_15_1 xmm8, src, len

        ; XOR KS with plaintext and store resulting ciphertext
        pxor    xmm8, xmm9

        simd_store_sse_15 dst, xmm8, len, tmp, tmp2

        jmp     no_partial_block

two_blocks_left:

        ; Get last block counter by dividing offset by 64
        shr     off, 6

        ; Prepare next 2 chacha states from IV and key
        movdqu  xmm1, [keys]          ; Load key bytes 0-15
        movdqu  xmm2, [keys + 16]     ; Load key bytes 16-31
        ; Read nonce (12 bytes)
        movq    xmm3, [iv]
        pinsrd  xmm3, [iv + 8], 2
        pslldq  xmm3, 4
        movdqa  xmm0, [rel constants]

        movdqa  xmm8, xmm3

        ; Insert next block counts
        inc     DWORD(off)
        movd    xmm4, DWORD(off)
        por     xmm3, xmm4
        inc     DWORD(off)
        movd    xmm5, DWORD(off)
        por     xmm8, xmm5
        sub     off, 2
        shl     off, 6 ; Restore offset

        ; Generate 128 bytes of keystream
        GENERATE_64_128_KS xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, \
                           xmm13, xmm8, xmm9, xmm10, xmm11, xmm12

        cmp     len, 128
        jb      between_64_127

        ; Load plaintext, XOR with KS and store ciphertext
        movdqu  xmm14, [src + off]
        movdqu  xmm15, [src + off + 16]
        pxor    xmm14, xmm4
        pxor    xmm15, xmm5
        movdqu  [dst + off], xmm14
        movdqu  [dst + off + 16], xmm15

        movdqu  xmm14, [src + off + 16*2]
        movdqu  xmm15, [src + off + 16*3]
        pxor    xmm14, xmm6
        pxor    xmm15, xmm7
        movdqu  [dst + off + 16*2], xmm14
        movdqu  [dst + off + 16*3], xmm15

        movdqu  xmm14, [src + off + 16*4]
        movdqu  xmm15, [src + off + 16*5]
        pxor    xmm14, xmm9
        pxor    xmm15, xmm10
        movdqu  [dst + off + 16*4], xmm14
        movdqu  [dst + off + 16*5], xmm15

        movdqu  xmm14, [src + off + 16*6]
        movdqu  xmm15, [src + off + 16*7]
        pxor    xmm14, xmm11
        pxor    xmm15, xmm12
        movdqu  [dst + off + 16*6], xmm14
        movdqu  [dst + off + 16*7], xmm15

        jmp     no_partial_block

between_64_127:
        ; Load plaintext, XOR with KS and store ciphertext for first 64 bytes
        movdqu  xmm14, [src + off]
        movdqu  xmm15, [src + off + 16]
        pxor    xmm14, xmm4
        pxor    xmm15, xmm5
        movdqu  [dst + off], xmm14
        movdqu  [dst + off + 16], xmm15

        movdqu  xmm14, [src + off + 16*2]
        movdqu  xmm15, [src + off + 16*3]
        pxor    xmm14, xmm6
        pxor    xmm15, xmm7
        movdqu  [dst + off + 16*2], xmm14
        movdqu  [dst + off + 16*3], xmm15

        sub     len, 64
        add     off, 64
        ; Handle the remaining (up to 63) bytes in "less_than_64"
        jmp     less_than_64

more_than_2_blocks_left:

        ;; First generate 128 bytes of KS to encrypt the next 128 bytes

        ; Get last block counter by dividing offset by 64
        shr     off, 6

        ; Prepare next 2 chacha states from IV and key
        movdqu  xmm1, [keys]          ; Load key bytes 0-15
        movdqu  xmm2, [keys + 16]     ; Load key bytes 16-31
        ; Read nonce (12 bytes)
        movq    xmm3, [iv]
        pinsrd  xmm3, [iv + 8], 2
        pslldq  xmm3, 4
        movdqa  xmm0, [rel constants]

        movdqa  xmm8, xmm3

        ; Insert next block counts
        inc     DWORD(off)
        movd    xmm4, DWORD(off)
        por     xmm3, xmm4
        inc     DWORD(off)
        movd    xmm5, DWORD(off)
        por     xmm8, xmm5
        sub     off, 2
        shl     off, 6 ; Restore offset

        ; Generate 128 bytes of keystream
        GENERATE_64_128_KS xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, \
                           xmm13, xmm8, xmm9, xmm10, xmm11, xmm12

        ; Load plaintext, XOR with KS and store ciphertext
        movdqu  xmm14, [src + off]
        movdqu  xmm15, [src + off + 16]
        pxor    xmm14, xmm4
        pxor    xmm15, xmm5
        movdqu  [dst + off], xmm14
        movdqu  [dst + off + 16], xmm15

        movdqu  xmm14, [src + off + 16*2]
        movdqu  xmm15, [src + off + 16*3]
        pxor    xmm14, xmm6
        pxor    xmm15, xmm7
        movdqu  [dst + off + 16*2], xmm14
        movdqu  [dst + off + 16*3], xmm15

        movdqu  xmm14, [src + off + 16*4]
        movdqu  xmm15, [src + off + 16*5]
        pxor    xmm14, xmm9
        pxor    xmm15, xmm10
        movdqu  [dst + off + 16*4], xmm14
        movdqu  [dst + off + 16*5], xmm15

        movdqu  xmm14, [src + off + 16*6]
        movdqu  xmm15, [src + off + 16*7]
        pxor    xmm14, xmm11
        pxor    xmm15, xmm12
        movdqu  [dst + off + 16*6], xmm14
        movdqu  [dst + off + 16*7], xmm15

        sub     len, 128
        add     off, 128

        jmp     check_1_or_2_blocks_left

no_partial_block:

%ifdef SAFE_DATA
        clear_all_xmms_sse_asm
        ; Clear stack frame
%assign i 0
%rep 16
        movdqa  [rsp + _STATE + 16*i], xmm0
%assign i (i + 1)
%endrep
        movdqa  [rsp + _XMM_SAVE], xmm0
        movdqa  [rsp + _XMM_SAVE + 16], xmm0
%endif

        mov     rsp, [rsp + _RSP_SAVE]

exit:
        mov     rax, job
        or      dword [rax + _status], STS_COMPLETED_AES

        ret

;;
;; void poly1305_key_gen_sse(IMB_JOB *job, void *poly_key)
align 32
MKGLOBAL(poly1305_key_gen_sse,function,internal)
poly1305_key_gen_sse:
        ;; prepare chacha state from IV and key
        mov     rax, [job + _enc_keys]
        movdqa  xmm0, [rel constants]
        movdqu  xmm1, [rax]          ; Load key bytes 0-15
        movdqu  xmm2, [rax + 16]     ; Load key bytes 16-31
        ;; copy nonce (12 bytes)
        mov     rax, [job + _iv]
        movq    xmm3, [rax]
        pinsrd  xmm3, [rax + 8], 2
        pslldq  xmm3, 4

        ;; run the 20 ChaCha20 rounds to generate the first 64-byte
        ;; keystream block (block count 0)
        GENERATE_64_128_KS xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8

        ;; clamp R and store poly1305 key
        ;; R = KEY[0..15] & 0x0ffffffc0ffffffc0ffffffc0fffffff
        pand    xmm4, [rel poly_clamp_r]
        movdqu  [arg2 + 0 * 16], xmm4
        movdqu  [arg2 + 1 * 16], xmm5
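
        ;; Per RFC 7539, the Poly1305 one-time key is the first 32 bytes of
        ;; ChaCha20 keystream for block count 0 (dword 0 of xmm3 was left as
        ;; zero by the pslldq above): bytes 0-15 become the clamped "r",
        ;; bytes 16-31 become "s"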

%ifdef SAFE_DATA
        clear_all_xmms_sse_asm
%endif
        ret

%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif