# This submission to NSS is to be made available under the terms of the
# Mozilla Public License, v. 2.0. You can obtain a copy at http://mozilla.org/MPL/2.0/
# Copyright (c) 2021, Niels Möller and Mamone Tarsha

# Registers:

.set SP, 1
.set TOCP, 2

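# Load the 16-byte constant at label \DATA into vector register \VR,
# resolving the address through the GOT via the TOC pointer in r2;
# \GPR is clobbered as a scratch register.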
.macro VEC_LOAD_DATA   VR, DATA, GPR
    addis        \GPR, 2, \DATA@got@ha
    ld           \GPR, \DATA@got@l(\GPR)
    lvx          \VR, 0, \GPR
.endm

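# Load/store a 16-byte vector, swapping its two doublewords with
# SWAP_MASK (the usual lxvd2x/stxvd2x little-endian fixup).
# VEC_LOAD_INC also advances \IDX by 16 for sequential access.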
.macro VEC_LOAD   VR, GPR, IDX
    lxvd2x       \VR+32, \IDX, \GPR
    vperm        \VR, \VR, \VR, SWAP_MASK
.endm

.macro VEC_LOAD_INC   VR, GPR, IDX
    lxvd2x       \VR+32, \IDX, \GPR
    addi         \IDX, \IDX, 16
    vperm        \VR, \VR, \VR, SWAP_MASK
.endm

.macro VEC_STORE   VR, GPR, IDX
    vperm        \VR, \VR, \VR, SWAP_MASK
    stxvd2x      \VR+32, \IDX, \GPR
.endm

# 0 < LEN < 16, pad the remaining bytes with zeros
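# On exit, \VAL1 holds bytes 0..7 and \VAL0 bytes 8..15 of \DATA, each
# assembled most-significant-byte first and zero-padded beyond \LEN.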
.macro LOAD_LEN  DATA, LEN, VAL1, VAL0, TMP0, TMP1, TMP2
    li           \TMP0, 0
    li           \VAL1, 0
    li           \VAL0, 0
    andi.        \TMP1, \LEN, 8
    beq          1f
    ldbrx        \VAL1, 0, \DATA
    li           \TMP0, 8
1:
    andi.        \TMP1, \LEN, 7
    beq          3f
    li           \TMP1, 56
2:
    lbzx         \TMP2, \TMP0, \DATA
    sld          \TMP2, \TMP2, \TMP1
    subi         \TMP1, \TMP1, 8
    or           \VAL0, \VAL0, \TMP2
    addi         \TMP0, \TMP0, 1
    cmpld        \TMP0, \LEN
    bne          2b
    andi.        \TMP1, \LEN, 8
    bne          3f
    mr           \VAL1, \VAL0
    li           \VAL0, 0
3:
.endm

# 0 < LEN < 16
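# Stores the leading \LEN bytes of the big-endian doubleword pair
# \VAL1:\VAL0 to \DATA; \VAL0 and the temporaries are clobbered.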
.macro STORE_LEN DATA, LEN, VAL1, VAL0, TMP0, TMP1, TMP2
    andi.        \TMP1, \LEN, 8
    beq          1f
    stdbrx       \VAL1, 0, \DATA
    li           \TMP0, 8
    b            2f
1:
    li           \TMP0, 0
    mr           \VAL0, \VAL1
2:
    andi.        \TMP1, \LEN, 7
    beq          4f
    li           \TMP1, 56
3:
    srd          \TMP2, \VAL0, \TMP1
    subi         \TMP1, \TMP1, 8
    stbx         \TMP2, \TMP0, \DATA
    addi         \TMP0, \TMP0, 1
    cmpld        \TMP0, \LEN
    bne          3b
4:
.endm

.text

################################################################################
# Generates the H table
# void ppc_aes_gcmINIT(uint8_t Htbl[16*8], uint32_t *KS, int NR);
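# On return, Htbl holds at 16-byte intervals the precomputed halves
# H^1 L, H^1 M, H^2 L, H^2 M, H^3 L, H^3 M, H^4 L, H^4 M consumed by
# the vpmsumd multiplies in ppc_aes_gcmHASH. KS is the expanded key
# schedule and NR the AES round count (10/12/14 for AES-128/192/256).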
.globl	ppc_aes_gcmINIT
.type	ppc_aes_gcmINIT,@function
.align	5
ppc_aes_gcmINIT:
addis	TOCP,12,(.TOC.-ppc_aes_gcmINIT)@ha
addi	TOCP,TOCP,(.TOC.-ppc_aes_gcmINIT)@l
.localentry	ppc_aes_gcmINIT, .-ppc_aes_gcmINIT

.set Htbl, 3
.set KS, 4
.set NR, 5

.set ZERO, 19
.set MSB, 18
.set ONE, 17
.set SWAP_MASK, 0
.set POLY, 1
.set K, 2
.set H, 3
.set H2, 4
.set H3, 5
.set H4, 6
.set HP, 7
.set HS, 8
.set R, 9
.set F, 10
.set T, 11
.set H1M, 12
.set H1L, 13
.set H2M, 14
.set H2L, 15
.set H3M, 16
.set H3L, 17
.set H4M, 18
.set H4L, 19

    VEC_LOAD_DATA SWAP_MASK, .Ldb_bswap_mask, 6
    VEC_LOAD_DATA POLY, .Lpoly, 6

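    # Compute the hash key H = E_K(0^128). Loading round key 0 directly
    # into H is the initial AddRoundKey of an all-zero state; the
    # remaining rounds and the final vcipherlast complete the encryption.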
    li           6, 0
    VEC_LOAD_INC H, KS, 6
    VEC_LOAD_INC K, KS, 6
    vcipher      H, H, K
    VEC_LOAD_INC K, KS, 6
    vcipher      H, H, K
    VEC_LOAD_INC K, KS, 6
    vcipher      H, H, K
    VEC_LOAD_INC K, KS, 6
    vcipher      H, H, K
    VEC_LOAD_INC K, KS, 6
    vcipher      H, H, K
    VEC_LOAD_INC K, KS, 6
    vcipher      H, H, K
    VEC_LOAD_INC K, KS, 6
    vcipher      H, H, K
    VEC_LOAD_INC K, KS, 6
    vcipher      H, H, K
    VEC_LOAD_INC K, KS, 6
    vcipher      H, H, K
    cmpwi        NR, 10
    beq          .LH_done
    VEC_LOAD_INC K, KS, 6
    vcipher      H, H, K
    VEC_LOAD_INC K, KS, 6
    vcipher      H, H, K
    cmpwi        NR, 12
    beq          .LH_done
    VEC_LOAD_INC K, KS, 6
    vcipher      H, H, K
    VEC_LOAD_INC K, KS, 6
    vcipher      H, H, K

.LH_done:
    VEC_LOAD     K, KS, 6
    vcipherlast  H, H, K

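    # Prepare H for the vpmsumd-based multiply: replicate H's top bit
    # into a mask, shift H left one bit, and xor in the polynomial when
    # that bit was set. This absorbs GCM's bit-reflected bit order so no
    # per-block bit reversal is needed; the vsldoi then moves the
    # reduction constant into POLY's low doubleword.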
    vupkhsb      MSB, H
    vspltisb     ONE, 1
    vspltb       MSB, MSB, 0
    vsl          H, H, ONE
    vand         MSB, MSB, POLY
    vxor         ZERO, ZERO, ZERO
    vxor         H, H, MSB
    vsldoi       POLY, ZERO, POLY, 8

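    # Split H (and, below, each higher power of H) into the L/M halves
    # consumed by the two-vpmsumd multiply, mirroring Nettle's PowerPC
    # GHASH layout.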
    vpmsumd      HP, H, POLY
    vsldoi       HS, H, H, 8
    vxor         HP, HP, HS
    vsldoi       H1L, HP, HS, 8
    vsldoi       H1M, HS, HP, 8
    vsldoi       H1L, H1L, H1L, 8

    # calculate H^2

    vpmsumd      F, H, H1L
    vpmsumd      R, H, H1M

    vpmsumd      T, F, POLY
    vsldoi       H2, F, F, 8
    vxor         R, R, T
    vxor         H2, H2, R

    vpmsumd      HP, H2, POLY
    vsldoi       HS, H2, H2, 8
    vxor         HP, HP, HS
    vsldoi       H2L, HP, HS, 8
    vsldoi       H2M, HS, HP, 8
    vsldoi       H2L, H2L, H2L, 8

    # calculate H^3

    vpmsumd      F, H2, H1L
    vpmsumd      R, H2, H1M

    vpmsumd      T, F, POLY
    vsldoi       H3, F, F, 8
    vxor         R, R, T
    vxor         H3, H3, R

    vpmsumd      HP, H3, POLY
    vsldoi       HS, H3, H3, 8
    vxor         HP, HP, HS
    vsldoi       H3L, HP, HS, 8
    vsldoi       H3M, HS, HP, 8
    vsldoi       H3L, H3L, H3L, 8

    # calculate H^4

    vpmsumd      F, H2, H2L
    vpmsumd      R, H2, H2M

    vpmsumd      T, F, POLY
    vsldoi       H4, F, F, 8
    vxor         R, R, T
    vxor         H4, H4, R

    vpmsumd      HP, H4, POLY
    vsldoi       HS, H4, H4, 8
    vxor         HP, HP, HS
    vsldoi       H4L, HP, HS, 8
    vsldoi       H4M, HS, HP, 8
    vsldoi       H4L, H4L, H4L, 8

    li           8, 16*1
    li           9, 16*2
    li           10, 16*3
    stxvd2x      H1L+32, 0, Htbl
    stxvd2x      H1M+32, 8, Htbl
    stxvd2x      H2L+32, 9, Htbl
    stxvd2x      H2M+32, 10, Htbl
    li           7, 16*4
    li           8, 16*5
    li           9, 16*6
    li           10, 16*7
    stxvd2x      H3L+32, 7, Htbl
    stxvd2x      H3M+32, 8, Htbl
    stxvd2x      H4L+32, 9, Htbl
    stxvd2x      H4M+32, 10, Htbl

    blr
.size ppc_aes_gcmINIT, . - ppc_aes_gcmINIT

################################################################################
# Authenticate only
# void ppc_aes_gcmHASH(uint8_t Htbl[16*8], uint8_t *AAD, uint64_t Alen, uint8_t *Tp);
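# Absorbs Alen bytes of AAD into the running GHASH digest at Tp,
# processing 4, then 2, then 1 block(s) per pass and zero-padding a
# final partial block.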
.globl	ppc_aes_gcmHASH
.type	ppc_aes_gcmHASH,@function
.align	5
ppc_aes_gcmHASH:
addis	TOCP,12,(.TOC.-ppc_aes_gcmHASH)@ha
addi	TOCP,TOCP,(.TOC.-ppc_aes_gcmHASH)@l
.localentry	ppc_aes_gcmHASH, .-ppc_aes_gcmHASH

.set Htbl, 3
.set AAD, 4
.set Alen, 5
.set Tp, 6

.set SWAP_MASK, 0
.set POLY, 1
.set D, 2
.set C0, 3
.set C1, 4
.set C2, 5
.set C3, 6
.set T, 7
.set R, 8
.set F, 9
.set R2, 10
.set F2, 11
.set R3, 12
.set F3, 13
.set R4, 14
.set F4, 15
.set H1M, 16
.set H1L, 17
.set H2M, 18
.set H2L, 19
.set H3M, 28
.set H3L, 29
.set H4M, 30
.set H4L, 31

    # store non-volatile vector registers
    addi         7, SP, -16
    stvx         31, 0, 7
    addi         7, SP, -32
    stvx         30, 0, 7
    addi         7, SP, -48
    stvx         29, 0, 7
    addi         7, SP, -64
    stvx         28, 0, 7

    VEC_LOAD_DATA SWAP_MASK, .Ldb_bswap_mask, 7
    VEC_LOAD_DATA POLY, .Lpoly_r, 7

    VEC_LOAD     D, Tp, 0

    # --- process 4 blocks ---

    srdi.        7, Alen, 6               # 4-block loop count
    beq          .L2x

    mtctr        7                        # set counter register

    # load table elements
    li           8, 1*16
    li           9, 2*16
    li           10, 3*16
    lxvd2x       H1L+32, 0, Htbl
    lxvd2x       H1M+32, 8, Htbl
    lxvd2x       H2L+32, 9, Htbl
    lxvd2x       H2M+32, 10, Htbl
    li           7, 4*16
    li           8, 5*16
    li           9, 6*16
    li           10, 7*16
    lxvd2x       H3L+32, 7, Htbl
    lxvd2x       H3M+32, 8, Htbl
    lxvd2x       H4L+32, 9, Htbl
    lxvd2x       H4M+32, 10, Htbl

    li           8, 0x10
    li           9, 0x20
    li           10, 0x30
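    # Each iteration hashes 64 bytes: the previous digest is folded into
    # the first block, the four blocks are multiplied by H^4..H^1
    # respectively, and the partial products are accumulated so a single
    # reduction per iteration suffices.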
.align 5
.L4x_loop:
    # load input
    lxvd2x       C0+32, 0, AAD
    lxvd2x       C1+32, 8, AAD
    lxvd2x       C2+32, 9, AAD
    lxvd2x       C3+32, 10, AAD

    vperm        C0, C0, C0, SWAP_MASK
    vperm        C1, C1, C1, SWAP_MASK
    vperm        C2, C2, C2, SWAP_MASK
    vperm        C3, C3, C3, SWAP_MASK

    # previous digest combining
    vxor         C0, C0, D

    # polynomial multiplication
    vpmsumd      F2, H3L, C1
    vpmsumd      R2, H3M, C1
    vpmsumd      F3, H2L, C2
    vpmsumd      R3, H2M, C2
    vpmsumd      F4, H1L, C3
    vpmsumd      R4, H1M, C3
    vpmsumd      F, H4L, C0
    vpmsumd      R, H4M, C0

    # deferred recombination of partial products
    vxor         F3, F3, F4
    vxor         R3, R3, R4
    vxor         F, F, F2
    vxor         R, R, R2
    vxor         F, F, F3
    vxor         R, R, R3

    # reduction
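    # (T = F * POLY_r folds the overflow back through the reflected
    # polynomial; xored with R and the doubleword-swapped F this yields
    # the reduced 128-bit digest D.)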
    vpmsumd      T, F, POLY
    vsldoi       D, F, F, 8
    vxor         R, R, T
    vxor         D, R, D

    addi         AAD, AAD, 0x40
    bdnz         .L4x_loop

    clrldi       Alen, Alen, 58
.L2x:
    # --- process 2 blocks ---

    srdi.        7, Alen, 5
    beq          .L1x

    # load table elements
    li           8, 1*16
    li           9, 2*16
    li           10, 3*16
    lxvd2x       H1L+32, 0, Htbl
    lxvd2x       H1M+32, 8, Htbl
    lxvd2x       H2L+32, 9, Htbl
    lxvd2x       H2M+32, 10, Htbl

    # load input
    li           10, 0x10
    lxvd2x       C0+32, 0, AAD
    lxvd2x       C1+32, 10, AAD

    vperm        C0, C0, C0, SWAP_MASK
    vperm        C1, C1, C1, SWAP_MASK

    # previous digest combining
    vxor         C0, C0, D

    # polynomial multiplication
    vpmsumd      F2, H1L, C1
    vpmsumd      R2, H1M, C1
    vpmsumd      F, H2L, C0
    vpmsumd      R, H2M, C0

    # deferred recombination of partial products
    vxor         F, F, F2
    vxor         R, R, R2

    # reduction
    vpmsumd      T, F, POLY
    vsldoi       D, F, F, 8
    vxor         R, R, T
    vxor         D, R, D

    addi         AAD, AAD, 0x20
    clrldi       Alen, Alen, 59
.L1x:
    # --- process 1 block ---

    srdi.        7, Alen, 4
    beq          .Ltail

    # load table elements
    li           8, 1*16
    lxvd2x       H1L+32, 0, Htbl
    lxvd2x       H1M+32, 8, Htbl

    # load input
    lxvd2x       C0+32, 0, AAD

    vperm        C0, C0, C0, SWAP_MASK

    # previous digest combining
    vxor         C0, C0, D

    # polynomial multiplication
    vpmsumd      F, H1L, C0
    vpmsumd      R, H1M, C0

    # reduction
    vpmsumd      T, F, POLY
    vsldoi       D, F, F, 8
    vxor         R, R, T
    vxor         D, R, D

    addi         AAD, AAD, 0x10
    clrldi       Alen, Alen, 60

.Ltail:
    cmpldi       Alen, 0
    beq          .Lh_done
    # --- process the final partial block ---

    # load table elements
    li           8, 1*16
    lxvd2x       H1L+32, 0, Htbl
    lxvd2x       H1M+32, 8, Htbl

    LOAD_LEN     AAD, Alen, 10, 9, 3, 7, 8
    mtvrd        C0, 10
    mtvrd        C1, 9
    xxmrghd      C0+32, C0+32, C1+32

    # previous digest combining
    vxor         C0, C0, D

    # polynomial multiplication
    vpmsumd      F, H1L, C0
    vpmsumd      R, H1M, C0

    # reduction
    vpmsumd      T, F, POLY
    vsldoi       D, F, F, 8
    vxor         R, R, T
    vxor         D, R, D
.Lh_done:
    VEC_STORE    D, Tp, 0

    # restore non-volatile vector registers
    addi         7, SP, -16
    lvx          31, 0, 7
    addi         7, SP, -32
    lvx          30, 0, 7
    addi         7, SP, -48
    lvx          29, 0, 7
    addi         7, SP, -64
    lvx          28, 0, 7
    blr
.size ppc_aes_gcmHASH, . - ppc_aes_gcmHASH

################################################################################
# Generates the final GCM tag
# void ppc_aes_gcmTAG(uint8_t Htbl[16*8], uint8_t *Tp, uint64_t Mlen, uint64_t Alen, uint8_t* X0, uint8_t* TAG);
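# Folds the length block (the bit lengths of the AAD and the message)
# into the digest at Tp, multiplies by H once more, and xors the result
# with the encrypted initial counter block X0 to form the 16-byte tag.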
.globl	ppc_aes_gcmTAG
.type	ppc_aes_gcmTAG,@function
.align	5
ppc_aes_gcmTAG:
addis	TOCP,12,(.TOC.-ppc_aes_gcmTAG)@ha
addi	TOCP,TOCP,(.TOC.-ppc_aes_gcmTAG)@l
.localentry	ppc_aes_gcmTAG, .-ppc_aes_gcmTAG

.set Htbl, 3
.set Tp, 4
.set Mlen, 5
.set Alen, 6
.set X0, 7
.set TAG, 8

.set SWAP_MASK, 0
.set POLY, 1
.set D, 2
.set C0, 3
.set C1, 4
.set T, 5
.set R, 6
.set F, 7
.set H1M, 8
.set H1L, 9
.set X, 10

    VEC_LOAD_DATA SWAP_MASK, .Ldb_bswap_mask, 9
    VEC_LOAD_DATA POLY, .Lpoly_r, 9

    VEC_LOAD     D, Tp, 0

    # load table elements
    li           9, 1*16
    lxvd2x       H1L+32, 0, Htbl
    lxvd2x       H1M+32, 9, Htbl

    sldi         Alen, Alen, 3
    sldi         Mlen, Mlen, 3
    mtvrd        C0, Alen
    mtvrd        C1, Mlen
    xxmrghd      C0+32, C0+32, C1+32

    # previous digest combining
    vxor         C0, C0, D

    # polynomial multiplication
    vpmsumd      F, H1L, C0
    vpmsumd      R, H1M, C0

    # reduction
    vpmsumd      T, F, POLY
    vsldoi       D, F, F, 8
    vxor         R, R, T
    vxor         D, R, D

    lxvd2x       X+32, 0, X0
    vperm        D, D, D, SWAP_MASK
    vxor         X, X, D
    stxvd2x      X+32, 0, TAG

    blr
.size ppc_aes_gcmTAG, . - ppc_aes_gcmTAG

################################################################################
# Crypt only
# void ppc_aes_gcmCRYPT(const uint8_t* PT, uint8_t* CT, uint64_t LEN, uint8_t *CTRP, uint32_t *KS, int NR);
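# CTR-mode encryption (decryption is the same operation in GCM):
# processes 8, 4, 2, then 1 block(s) per pass plus a final partial
# block. The counter block at CTRP has its counter word incremented per
# block and is written back on exit.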
.globl	ppc_aes_gcmCRYPT
.type	ppc_aes_gcmCRYPT,@function
.align	5
ppc_aes_gcmCRYPT:
addis	TOCP,12,(.TOC.-ppc_aes_gcmCRYPT)@ha
addi	TOCP,TOCP,(.TOC.-ppc_aes_gcmCRYPT)@l
.localentry	ppc_aes_gcmCRYPT, .-ppc_aes_gcmCRYPT

.set PT, 3
.set CT, 4
.set LEN, 5
.set CTRP, 6
.set KS, 7
.set NR, 8

.set SWAP_MASK, 0
.set K, 1
.set CTR, 2
.set CTR0, 3
.set CTR1, 4
.set CTR2, 5
.set CTR3, 6
.set CTR4, 7
.set CTR5, 8
.set CTR6, 9
.set CTR7, 10
.set ZERO, 11
.set I1, 12
.set I2, 13
.set I3, 14
.set I4, 15
.set I5, 16
.set I6, 17
.set I7, 18
.set I8, 19
.set IN0, 24
.set IN1, 25
.set IN2, 26
.set IN3, 27
.set IN4, 28
.set IN5, 29
.set IN6, 30
.set IN7, 31

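# One AES round applied to 8/4/2/1 counter blocks in flight; each use
# loads the next round key and advances the key-schedule offset in r10.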
.macro ROUND_8
    VEC_LOAD_INC K, KS, 10
    vcipher      CTR0, CTR0, K
    vcipher      CTR1, CTR1, K
    vcipher      CTR2, CTR2, K
    vcipher      CTR3, CTR3, K
    vcipher      CTR4, CTR4, K
    vcipher      CTR5, CTR5, K
    vcipher      CTR6, CTR6, K
    vcipher      CTR7, CTR7, K
.endm

.macro ROUND_4
    VEC_LOAD_INC K, KS, 10
    vcipher      CTR0, CTR0, K
    vcipher      CTR1, CTR1, K
    vcipher      CTR2, CTR2, K
    vcipher      CTR3, CTR3, K
.endm

.macro ROUND_2
    VEC_LOAD_INC K, KS, 10
    vcipher      CTR0, CTR0, K
    vcipher      CTR1, CTR1, K
.endm

.macro ROUND_1
    VEC_LOAD_INC K, KS, 10
    vcipher      CTR0, CTR0, K
.endm

    # store non-volatile general registers
    std          31, -8(SP)
    std          30, -16(SP)
    std          29, -24(SP)
    std          28, -32(SP)
    std          27, -40(SP)
    std          26, -48(SP)
    std          25, -56(SP)

    # store non-volatile vector registers
    addi         9, SP, -80
    stvx         31, 0, 9
    addi         9, SP, -96
    stvx         30, 0, 9
    addi         9, SP, -112
    stvx         29, 0, 9
    addi         9, SP, -128
    stvx         28, 0, 9
    addi         9, SP, -144
    stvx         27, 0, 9
    addi         9, SP, -160
    stvx         26, 0, 9
    addi         9, SP, -176
    stvx         25, 0, 9
    addi         9, SP, -192
    stvx         24, 0, 9

    VEC_LOAD_DATA SWAP_MASK, .Ldb_bswap_mask, 9

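    # Build the increment constants 1..8: vspltisb broadcasts n to every
    # byte and the vsldoi against ZERO keeps only the last byte, leaving
    # the 128-bit integer n. vadduwm then bumps CTR's counter word with
    # no carry across words, matching GCM's 32-bit counter increment.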
    vxor         ZERO, ZERO, ZERO
    vspltisb     I1, 1
    vspltisb     I2, 2
    vspltisb     I3, 3
    vspltisb     I4, 4
    vspltisb     I5, 5
    vspltisb     I6, 6
    vspltisb     I7, 7
    vspltisb     I8, 8
    vsldoi       I1, ZERO, I1, 1
    vsldoi       I2, ZERO, I2, 1
    vsldoi       I3, ZERO, I3, 1
    vsldoi       I4, ZERO, I4, 1
    vsldoi       I5, ZERO, I5, 1
    vsldoi       I6, ZERO, I6, 1
    vsldoi       I7, ZERO, I7, 1
    vsldoi       I8, ZERO, I8, 1

    VEC_LOAD     CTR, CTRP, 0

    srdi.        9, LEN, 7
    beq          .Lctr_4x

    mtctr        9

    li           25, 0x10
    li           26, 0x20
    li           27, 0x30
    li           28, 0x40
    li           29, 0x50
    li           30, 0x60
    li           31, 0x70

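    # Main loop: generate 8 keystream blocks per iteration, xor them
    # with 128 bytes of input, and advance the counter by 8.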
.align 5
.L8x_loop:
    li           10, 0
    VEC_LOAD_INC K, KS, 10

    vadduwm      CTR1, CTR, I1
    vadduwm      CTR2, CTR, I2
    vadduwm      CTR3, CTR, I3
    vadduwm      CTR4, CTR, I4
    vadduwm      CTR5, CTR, I5
    vadduwm      CTR6, CTR, I6
    vadduwm      CTR7, CTR, I7

    vxor         CTR0, CTR,  K
    vxor         CTR1, CTR1, K
    vxor         CTR2, CTR2, K
    vxor         CTR3, CTR3, K
    vxor         CTR4, CTR4, K
    vxor         CTR5, CTR5, K
    vxor         CTR6, CTR6, K
    vxor         CTR7, CTR7, K

    ROUND_8
    ROUND_8
    ROUND_8
    ROUND_8
    ROUND_8
    ROUND_8
    ROUND_8
    ROUND_8
    ROUND_8
    cmpwi        NR, 10
    beq          .Llast_8
    ROUND_8
    ROUND_8
    cmpwi        NR, 12
    beq          .Llast_8
    ROUND_8
    ROUND_8

.Llast_8:
    VEC_LOAD     K, KS, 10
    vcipherlast  CTR0, CTR0, K
    vcipherlast  CTR1, CTR1, K
    vcipherlast  CTR2, CTR2, K
    vcipherlast  CTR3, CTR3, K
    vcipherlast  CTR4, CTR4, K
    vcipherlast  CTR5, CTR5, K
    vcipherlast  CTR6, CTR6, K
    vcipherlast  CTR7, CTR7, K

    lxvd2x       IN0+32, 0,  PT
    lxvd2x       IN1+32, 25, PT
    lxvd2x       IN2+32, 26, PT
    lxvd2x       IN3+32, 27, PT
    lxvd2x       IN4+32, 28, PT
    lxvd2x       IN5+32, 29, PT
    lxvd2x       IN6+32, 30, PT
    lxvd2x       IN7+32, 31, PT

    vperm        CTR0, CTR0, CTR0, SWAP_MASK
    vperm        CTR1, CTR1, CTR1, SWAP_MASK
    vperm        CTR2, CTR2, CTR2, SWAP_MASK
    vperm        CTR3, CTR3, CTR3, SWAP_MASK
    vperm        CTR4, CTR4, CTR4, SWAP_MASK
    vperm        CTR5, CTR5, CTR5, SWAP_MASK
    vperm        CTR6, CTR6, CTR6, SWAP_MASK
    vperm        CTR7, CTR7, CTR7, SWAP_MASK

    vxor         IN0, IN0, CTR0
    vxor         IN1, IN1, CTR1
    vxor         IN2, IN2, CTR2
    vxor         IN3, IN3, CTR3
    vxor         IN4, IN4, CTR4
    vxor         IN5, IN5, CTR5
    vxor         IN6, IN6, CTR6
    vxor         IN7, IN7, CTR7

    stxvd2x      IN0+32, 0,  CT
    stxvd2x      IN1+32, 25, CT
    stxvd2x      IN2+32, 26, CT
    stxvd2x      IN3+32, 27, CT
    stxvd2x      IN4+32, 28, CT
    stxvd2x      IN5+32, 29, CT
    stxvd2x      IN6+32, 30, CT
    stxvd2x      IN7+32, 31, CT

    vadduwm      CTR, CTR, I8
    addi         PT, PT, 0x80
    addi         CT, CT, 0x80
    bdnz         .L8x_loop

    clrldi       LEN, LEN, 57

.Lctr_4x:
    srdi.        9, LEN, 6
    beq          .Lctr_2x

    li           10, 0
    li           29, 0x10
    li           30, 0x20
    li           31, 0x30

    VEC_LOAD_INC K, KS, 10

    vadduwm      CTR1, CTR, I1
    vadduwm      CTR2, CTR, I2
    vadduwm      CTR3, CTR, I3

    vxor         CTR0, CTR,  K
    vxor         CTR1, CTR1, K
    vxor         CTR2, CTR2, K
    vxor         CTR3, CTR3, K

    ROUND_4
    ROUND_4
    ROUND_4
    ROUND_4
    ROUND_4
    ROUND_4
    ROUND_4
    ROUND_4
    ROUND_4
    cmpwi        NR, 10
    beq          .Llast_4
    ROUND_4
    ROUND_4
    cmpwi        NR, 12
    beq          .Llast_4
    ROUND_4
    ROUND_4

.Llast_4:
    VEC_LOAD     K, KS, 10
    vcipherlast  CTR0, CTR0, K
    vcipherlast  CTR1, CTR1, K
    vcipherlast  CTR2, CTR2, K
    vcipherlast  CTR3, CTR3, K

    lxvd2x       IN0+32, 0,  PT
    lxvd2x       IN1+32, 29, PT
    lxvd2x       IN2+32, 30, PT
    lxvd2x       IN3+32, 31, PT

    vperm        CTR0, CTR0, CTR0, SWAP_MASK
    vperm        CTR1, CTR1, CTR1, SWAP_MASK
    vperm        CTR2, CTR2, CTR2, SWAP_MASK
    vperm        CTR3, CTR3, CTR3, SWAP_MASK

    vxor         IN0, IN0, CTR0
    vxor         IN1, IN1, CTR1
    vxor         IN2, IN2, CTR2
    vxor         IN3, IN3, CTR3

    stxvd2x      IN0+32, 0,  CT
    stxvd2x      IN1+32, 29, CT
    stxvd2x      IN2+32, 30, CT
    stxvd2x      IN3+32, 31, CT

    vadduwm      CTR, CTR, I4
    addi         PT, PT, 0x40
    addi         CT, CT, 0x40

    clrldi       LEN, LEN, 58

.Lctr_2x:
    srdi.        9, LEN, 5
    beq          .Lctr_1x

    li           10, 0
    li           31, 0x10

    VEC_LOAD_INC K, KS, 10

    vadduwm      CTR1, CTR, I1

    vxor         CTR0, CTR,  K
    vxor         CTR1, CTR1, K

    ROUND_2
    ROUND_2
    ROUND_2
    ROUND_2
    ROUND_2
    ROUND_2
    ROUND_2
    ROUND_2
    ROUND_2
    cmpwi        NR, 10
    beq          .Llast_2
    ROUND_2
    ROUND_2
    cmpwi        NR, 12
    beq          .Llast_2
    ROUND_2
    ROUND_2

.Llast_2:
    VEC_LOAD     K, KS, 10
    vcipherlast  CTR0, CTR0, K
    vcipherlast  CTR1, CTR1, K

    lxvd2x       IN0+32, 0,  PT
    lxvd2x       IN1+32, 31, PT

    vperm        CTR0, CTR0, CTR0, SWAP_MASK
    vperm        CTR1, CTR1, CTR1, SWAP_MASK

    vxor         IN0, IN0, CTR0
    vxor         IN1, IN1, CTR1

    stxvd2x      IN0+32, 0,  CT
    stxvd2x      IN1+32, 31, CT

    vadduwm      CTR, CTR, I2
    addi         PT, PT, 0x20
    addi         CT, CT, 0x20

    clrldi       LEN, LEN, 59

.Lctr_1x:
    srdi.        9, LEN, 4
    beq          .Lctr_tail

    li           10, 0

    VEC_LOAD_INC K, KS, 10
    vxor         CTR0, CTR,  K

    ROUND_1
    ROUND_1
    ROUND_1
    ROUND_1
    ROUND_1
    ROUND_1
    ROUND_1
    ROUND_1
    ROUND_1
    cmpwi        NR, 10
    beq          .Llast_1
    ROUND_1
    ROUND_1
    cmpwi        NR, 12
    beq          .Llast_1
    ROUND_1
    ROUND_1

.Llast_1:
    VEC_LOAD     K, KS, 10
    vcipherlast  CTR0, CTR0, K

    lxvd2x       IN0+32, 0, PT

    vperm        CTR0, CTR0, CTR0, SWAP_MASK

    vxor         IN0, IN0, CTR0

    stxvd2x      IN0+32, 0, CT

    vadduwm      CTR, CTR, I1
    addi         PT, PT, 0x10
    addi         CT, CT, 0x10

    clrldi       LEN, LEN, 60

.Lctr_tail:
    cmpldi       LEN, 0
    beq          .Lc_done

    li           10, 0

    VEC_LOAD_INC K, KS, 10
    vxor         CTR0, CTR,  K

    ROUND_1
    ROUND_1
    ROUND_1
    ROUND_1
    ROUND_1
    ROUND_1
    ROUND_1
    ROUND_1
    ROUND_1
    cmpwi        NR, 10
    beq          .Llast_tail
    ROUND_1
    ROUND_1
    cmpwi        NR, 12
    beq          .Llast_tail
    ROUND_1
    ROUND_1

.Llast_tail:
    VEC_LOAD     K, KS, 10
    vcipherlast  CTR0, CTR0, K

    LOAD_LEN     PT, LEN, 10, 9, 29, 30, 31

    vsldoi       CTR1, CTR0, CTR0, 8
    mfvrd        31, CTR0
    mfvrd        30, CTR1

    xor          10, 10, 31
    xor          9, 9, 30

    STORE_LEN    CT, LEN, 10, 9, 29, 30, 31

    vadduwm      CTR, CTR, I1

.Lc_done:
    VEC_STORE    CTR, CTRP, 0

    # restore non-volatile vector registers
    addi         9, SP, -80
    lvx          31, 0, 9
    addi         9, SP, -96
    lvx          30, 0, 9
    addi         9, SP, -112
    lvx          29, 0, 9
    addi         9, SP, -128
    lvx          28, 0, 9
    addi         9, SP, -144
    lvx          27, 0, 9
    addi         9, SP, -160
    lvx          26, 0, 9
    addi         9, SP, -176
    lvx          25, 0, 9
    addi         9, SP, -192
    lvx          24, 0, 9

    # restore non-volatile general registers
    ld           31, -8(SP)
    ld           30, -16(SP)
    ld           29, -24(SP)
    ld           28, -32(SP)
    ld           27, -40(SP)
    ld           26, -48(SP)
    ld           25, -56(SP)
    blr
.size ppc_aes_gcmCRYPT, . - ppc_aes_gcmCRYPT

.data
.align	4
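# GCM reduction constants: .Lpoly is used when preparing the hash key H
# in ppc_aes_gcmINIT; .Lpoly_r is the shifted variant used by the
# per-block reductions in ppc_aes_gcmHASH and ppc_aes_gcmTAG.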
.Lpoly:
	.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.Lpoly_r:
	.byte	0,0,0,0,0,0,0,0xc2,0,0,0,0,0,0,0,0
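# vperm mask that swaps the two doublewords of a vector; applied by
# VEC_LOAD/VEC_STORE as the lxvd2x/stxvd2x little-endian fixup.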
.Ldb_bswap_mask:
	.byte	8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7
