1#! /usr/bin/env perl
2# Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# February 2012
18#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication
# used in bn_gf2m.c. It's kind of a low-hanging mechanical port from
# C for the time being... The subroutine runs in 37 cycles, which is
# 4.5x faster than compiler-generated code. Though the comparison is
# totally unfair, because this module utilizes the Galois Field
# Multiply instruction.
25
# Pick the output file name off the command line: consume arguments until
# one ends in something that looks like "name.ext" (or until the argument
# list is exhausted, in which case $output ends up false).
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT to the requested file.
#  - No file name given: leave STDOUT alone, so the generated assembly goes
#    to whatever STDOUT the caller provided.
#  - File name given but not openable: die instead of carrying on and
#    silently writing the assembly to the wrong place.
# The 3-argument form is used so that characters such as '>', '&' or '|' in
# the file name are never interpreted as part of the open mode.
if (defined($output) && length($output)) {
    open(STDOUT, '>', $output) or die "can't open $output: $!";
}
28
# Argument vector: registers that hold rp, a1, a0, b1 and b0 on entry.
($rp, $a1, $a0, $b1, $b0) = qw(A4 B4 A6 B6 A8);

# Low halfword of the current A operand plus its four partial products
# (A-side registers), and the same for the high halfword (B-side registers).
($Alo, $Alox0, $Alox1, $Alox2, $Alox3) = map { "A" . $_ } 16 .. 20;
($Ahi, $Ahix0, $Ahix1, $Ahix2, $Ahix3) = map { "B" . $_ } 16 .. 20;

# The four bytes of the current B operand.
($B_0, $B_1, $B_2, $B_3) = qw(B5 A5 A7 B7);

# Operand registers the driver loads for the second and third products;
# they deliberately share registers with $Alo and $B_1.
$A = $Alo;
$B = $B_1;

# Register that the generated code loads with 0xFF and uses as a byte mask.
$xFF = "B1";
36
# mul_1x1_upper(A, B)
#
# Appends to $code the opening half of one 1x1 multiplication: register B is
# split into four bytes ($B_0..$B_3), register A into two halfwords
# ($Alo/$Ahi), and the eight 16x8-bit XORMPY products are issued into
# $Alox0..3 and $Ahix0..3.  Combining those products into a result is left
# to mul_1x1_merged or mul_1x1_lower.
#
# A, B - names of the registers holding the two operands.
sub mul_1x1_upper {
    my ($a_word, $b_word) = @_;

    # Both operand names are also interpolated into the assembly comments,
    # so the emitted text depends on them there as well.
    my $asm = <<"ASM";
	EXTU	$b_word,8,24,$B_2		; smash $b_word to 4 bytes
||	AND	$b_word,$xFF,$B_0
||	SHRU	$b_word,24,$B_3
	SHRU	$a_word,16,   $Ahi		; smash $a_word to two halfwords
||	EXTU	$a_word,16,16,$Alo

	XORMPY	$Alo,$B_2,$Alox2	; 16x8 bits multiplication
||	XORMPY	$Ahi,$B_2,$Ahix2
||	EXTU	$b_word,16,24,$B_1
	XORMPY	$Alo,$B_0,$Alox0
||	XORMPY	$Ahi,$B_0,$Ahix0
	XORMPY	$Alo,$B_3,$Alox3
||	XORMPY	$Ahi,$B_3,$Ahix3
	XORMPY	$Alo,$B_1,$Alox1
||	XORMPY	$Ahi,$B_1,$Ahix1
ASM
    $code .= $asm;
}
# mul_1x1_merged(OUTlo, OUThi, A, B)
#
# Appends to $code a block that interleaves two things:
#   * the combining step (XOR/SHL/SHRU) that folds the partial products
#     already sitting in $Alox0..3/$Ahix0..3 into OUTlo/OUThi, and
#   * the splitting and XORMPY step for the next operand pair A, B
#     (the instructions indented by one extra space in the emitted text).
#
# The new $Alo x $B_0 product is written to A1 first and moved into $Alox0
# only in the last packet, because the combining step still reads the
# previous $Alox0 in between.
#
# OUTlo, OUThi - registers receiving the low/high word of the finished product.
# A, B         - registers holding the operands of the next multiplication.
sub mul_1x1_merged {
    my ($res_lo, $res_hi, $a_word, $b_word) = @_;

    my $asm = <<"ASM";
	 EXTU	$b_word,8,24,$B_2		; smash $b_word to 4 bytes
||	 AND	$b_word,$xFF,$B_0
||	 SHRU	$b_word,24,$B_3
	 SHRU	$a_word,16,   $Ahi		; smash $a_word to two halfwords
||	 EXTU	$a_word,16,16,$Alo

	XOR	$Ahix0,$Alox2,$Ahix0
||	MV	$Ahix2,$res_hi
||	 XORMPY	$Alo,$B_2,$Alox2
	 XORMPY	$Ahi,$B_2,$Ahix2
||	 EXTU	$b_word,16,24,$B_1
||	 XORMPY	$Alo,$B_0,A1		; $Alox0
	XOR	$Ahix1,$Alox3,$Ahix1
||	SHL	$Ahix0,16,$res_lo
||	SHRU	$Ahix0,16,$Ahix0
	XOR	$Alox0,$res_lo,$res_lo
||	XOR	$Ahix0,$res_hi,$res_hi
||	 XORMPY	$Ahi,$B_0,$Ahix0
||	 XORMPY	$Alo,$B_3,$Alox3
||	SHL	$Alox1,8,$Alox1
||	SHL	$Ahix3,8,$Ahix3
	XOR	$Alox1,$res_lo,$res_lo
||	XOR	$Ahix3,$res_hi,$res_hi
||	 XORMPY	$Ahi,$B_3,$Ahix3
||	SHL	$Ahix1,24,$Alox1
||	SHRU	$Ahix1,8, $Ahix1
	XOR	$Alox1,$res_lo,$res_lo
||	XOR	$Ahix1,$res_hi,$res_hi
||	 XORMPY	$Alo,$B_1,$Alox1
||	 XORMPY	$Ahi,$B_1,$Ahix1
||	 MV	A1,$Alox0
ASM
    $code .= $asm;
}
# mul_1x1_lower(OUTlo, OUThi)
#
# Appends to $code the closing half of a 1x1 multiplication: the partial
# products left in $Alox0..3/$Ahix0..3 by the preceding XORMPY step are
# shifted into position and XORed together into OUTlo/OUThi.  No new
# multiplication is started here.
#
# OUTlo, OUThi - registers receiving the low/high word of the product.
sub mul_1x1_lower {
    my ($res_lo, $res_hi) = @_;

    my $asm = <<"ASM";
	;NOP
	XOR	$Ahix0,$Alox2,$Ahix0
||	MV	$Ahix2,$res_hi
	NOP
	XOR	$Ahix1,$Alox3,$Ahix1
||	SHL	$Ahix0,16,$res_lo
||	SHRU	$Ahix0,16,$Ahix0
	XOR	$Alox0,$res_lo,$res_lo
||	XOR	$Ahix0,$res_hi,$res_hi
||	SHL	$Alox1,8,$Alox1
||	SHL	$Ahix3,8,$Ahix3
	XOR	$Alox1,$res_lo,$res_lo
||	XOR	$Ahix3,$res_hi,$res_hi
||	SHL	$Ahix1,24,$Alox1
||	SHRU	$Ahix1,8, $Ahix1
	XOR	$Alox1,$res_lo,$res_lo
||	XOR	$Ahix1,$res_hi,$res_hi
ASM
    $code .= $asm;
}
# Emit bn_GF2m_mul_2x2 itself.  The 2x2 product is assembled from three 1x1
# products - a0·b0, a1·b1 and (a0+a1)·(b0+b1) - whose generators are chained
# so that the combining half of one product overlaps the multiply half of
# the next.  Every heredoc fragment below that starts with "||" continues
# the execute packet left open by the generator call just before it.

# Prologue: section, symbol naming for EABI vs. non-EABI, and the 0xFF mask.
$code .= <<"PROLOGUE";
	.text

	.if	.ASSEMBLER_VERSION<7000000
	.asg	0,__TI_EABI__
	.endif
	.if	__TI_EABI__
	.asg	bn_GF2m_mul_2x2,_bn_GF2m_mul_2x2
	.endif

	.global	_bn_GF2m_mul_2x2
_bn_GF2m_mul_2x2:
	.asmfunc
	MVK	0xFF,$xFF
PROLOGUE

mul_1x1_upper($a0, $b0);			# a0·b0

# Load the operands of the second product while the first is in flight.
$code .= <<"LOAD_A1B1";
||	MV	$b1,$B
	MV	$a1,$A
LOAD_A1B1

mul_1x1_merged("A28", "B28", $A, $B);		# a0·b0/a1·b1

# Form a0+a1 and b0+b1 as the operands of the third product.
$code .= <<"LOAD_SUMS";
||	XOR	$b0,$b1,$B
	XOR	$a0,$a1,$A
LOAD_SUMS

mul_1x1_merged("A31", "B31", $A, $B);		# a1·b1/(a0+a1)·(b0+b1)

$code .= <<"SUM_PRODUCTS";
	XOR	A28,A31,A29
||	XOR	B28,B31,B29			; a0·b0+a1·b1
SUM_PRODUCTS

mul_1x1_lower("A30", "B30");			# (a0+a1)·(b0+b1)

# Epilogue: branch to B3 (presumably the return address - confirm against
# the C6000 calling convention), fold the middle term into the result and
# store the four result words through $rp.
$code .= <<"EPILOGUE";
||	BNOP	B3
	XOR	A29,A30,A30
||	XOR	B29,B30,B30			; (a0+a1)·(b0+b1)-a0·b0-a1·b1
	XOR	B28,A30,A30
||	STW	A28,*${rp}[0]
	XOR	B30,A31,A31
||	STW	A30,*${rp}[1]
	STW	A31,*${rp}[2]
	STW	B31,*${rp}[3]
	.endasmfunc
EPILOGUE

# Write out everything accumulated in $code; close is checked so that a
# failed flush of buffered output is reported.
print $code;
close STDOUT or die "error closing STDOUT: $!";
161