#! /usr/bin/env perl
# Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# February 2012
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication
# used in bn_gf2m.c. It's kind of a low-hanging mechanical port from
# C for the time being... The subroutine runs in 37 cycles, which is
# 4.5x faster than compiler-generated code. Though comparison is
# totally unfair, because this module utilizes Galois Field Multiply
# instruction.

# The last command-line argument, if there is one, names the output file and
# STDOUT is redirected to it; with no argument the generated assembler goes
# to the inherited STDOUT.  The three-argument form of open keeps characters
# in the file name from being read as part of the open mode, and a failed
# open aborts the run instead of being ignored.
$output = pop and (open STDOUT, '>', $output or die "can't open $output: $!");

# Register names substituted into the assembler templates below.

# Registers holding the arguments of bn_GF2m_mul_2x2: result pointer and
# the four 32-bit input words.
($rp,$a1,$a0,$b1,$b0)=("A4","B4","A6","B6","A8");   # argument vector

# $Alo/$Ahi receive the two halfwords of the A operand; $Alox<n>/$Ahix<n>
# receive the XORMPY product of that halfword with byte <n> of the B operand.
($Alo,$Alox0,$Alox1,$Alox2,$Alox3)=map("A$_",(16..20));
($Ahi,$Ahix0,$Ahix1,$Ahix2,$Ahix3)=map("B$_",(16..20));
# The four bytes the B operand is split into.
($B_0,$B_1,$B_2,$B_3)=("B5","A5","A7","B7");
# Operand registers used for the second and third multiplications; they
# alias $Alo and $B_1, so the operand is split in place.
($A,$B)=($Alo,$B_1);
# Loaded with 0xFF at function entry and used as the low-byte mask.
$xFF="B1";

# mul_1x1_upper($A,$B)
#
# Appends to $code the first phase of one 32x32-bit carry-less
# multiplication of the registers named by $A and $B: $B is split into four
# bytes ($B_0..$B_3), $A into two halfwords ($Alo,$Ahi), and eight XORMPY
# instructions form every halfword-by-byte product in $Alox0..$Alox3 and
# $Ahix0..$Ahix3.  The products are left uncombined; the recombination is
# emitted by mul_1x1_merged or mul_1x1_lower.
#
# The heredoc text is the assembler output itself: instruction order and
# the "||" parallel markers must not be altered.
sub mul_1x1_upper {
my ($A,$B)=@_;
$code.=<<___;
	EXTU	$B,8,24,$B_2		; smash $B to 4 bytes
||	AND	$B,$xFF,$B_0
||	SHRU	$B,24,$B_3
	SHRU	$A,16,   $Ahi		; smash $A to two halfwords
||	EXTU	$A,16,16,$Alo

	XORMPY	$Alo,$B_2,$Alox2	; 16x8 bits multiplication
||	XORMPY	$Ahi,$B_2,$Ahix2
||	EXTU	$B,16,24,$B_1
	XORMPY	$Alo,$B_0,$Alox0
||	XORMPY	$Ahi,$B_0,$Ahix0
	XORMPY	$Alo,$B_3,$Alox3
||	XORMPY	$Ahi,$B_3,$Ahix3
	XORMPY	$Alo,$B_1,$Alox1
||	XORMPY	$Ahi,$B_1,$Ahix1
___
}
# mul_1x1_merged($OUTlo,$OUThi,$A,$B)
#
# Appends to $code two interleaved instruction streams:
#
#   - mnemonics without extra indentation recombine the partial products of
#     the PREVIOUS multiplication (left in $Alox*/$Ahix*) into the 64-bit
#     result $OUThi:$OUTlo, the same sequence mul_1x1_lower emits;
#   - mnemonics indented by one extra space split the registers named by $A
#     and $B and start the NEXT multiplication, the same sequence
#     mul_1x1_upper emits.
#
# The new $Alo x $B_0 product is written to A1 and copied into $Alox0 by
# the final MV; the recombination stream still reads $Alox0 after that
# XORMPY is issued, which appears to be why a temporary is used.
#
# The heredoc text is the assembler output itself: instruction order and
# the "||" parallel markers must not be altered.
sub mul_1x1_merged {
my ($OUTlo,$OUThi,$A,$B)=@_;
$code.=<<___;
	 EXTU	$B,8,24,$B_2		; smash $B to 4 bytes
||	 AND	$B,$xFF,$B_0
||	 SHRU	$B,24,$B_3
	 SHRU	$A,16,   $Ahi		; smash $A to two halfwords
||	 EXTU	$A,16,16,$Alo

	XOR	$Ahix0,$Alox2,$Ahix0
||	MV	$Ahix2,$OUThi
||	 XORMPY	$Alo,$B_2,$Alox2
	 XORMPY	$Ahi,$B_2,$Ahix2
||	 EXTU	$B,16,24,$B_1
||	 XORMPY	$Alo,$B_0,A1		; $Alox0
	XOR	$Ahix1,$Alox3,$Ahix1
||	SHL	$Ahix0,16,$OUTlo
||	SHRU	$Ahix0,16,$Ahix0
	XOR	$Alox0,$OUTlo,$OUTlo
||	XOR	$Ahix0,$OUThi,$OUThi
||	 XORMPY	$Ahi,$B_0,$Ahix0
||	 XORMPY	$Alo,$B_3,$Alox3
||	SHL	$Alox1,8,$Alox1
||	SHL	$Ahix3,8,$Ahix3
	XOR	$Alox1,$OUTlo,$OUTlo
||	XOR	$Ahix3,$OUThi,$OUThi
||	 XORMPY	$Ahi,$B_3,$Ahix3
||	SHL	$Ahix1,24,$Alox1
||	SHRU	$Ahix1,8, $Ahix1
	XOR	$Alox1,$OUTlo,$OUTlo
||	XOR	$Ahix1,$OUThi,$OUThi
||	 XORMPY	$Alo,$B_1,$Alox1
||	 XORMPY	$Ahi,$B_1,$Ahix1
||	 MV	A1,$Alox0
___
}
# mul_1x1_lower($OUTlo,$OUThi)
#
# Appends to $code the final phase of one multiplication: the partial
# products left in $Alox0..$Alox3 and $Ahix0..$Ahix3 are shifted into
# position and XORed together into the 64-bit result $OUThi:$OUTlo.
# Reading the emitted instructions, the net effect is
#
#   OUTlo = Alox0 ^ (Ahix0^Alox2)<<16 ^ Alox1<<8 ^ (Ahix1^Alox3)<<24
#   OUThi = Ahix2 ^ (Ahix0^Alox2)>>16 ^ Ahix3<<8 ^ (Ahix1^Alox3)>>8
#
# The product registers are overwritten in the process.  The first line,
# ";NOP", is an assembler comment, i.e. a NOP that has been disabled.
#
# The heredoc text is the assembler output itself: instruction order and
# the "||" parallel markers must not be altered.
sub mul_1x1_lower {
my ($OUTlo,$OUThi)=@_;
$code.=<<___;
	;NOP
	XOR	$Ahix0,$Alox2,$Ahix0
||	MV	$Ahix2,$OUThi
	NOP
	XOR	$Ahix1,$Alox3,$Ahix1
||	SHL	$Ahix0,16,$OUTlo
||	SHRU	$Ahix0,16,$Ahix0
	XOR	$Alox0,$OUTlo,$OUTlo
||	XOR	$Ahix0,$OUThi,$OUThi
||	SHL	$Alox1,8,$Alox1
||	SHL	$Ahix3,8,$Ahix3
	XOR	$Alox1,$OUTlo,$OUTlo
||	XOR	$Ahix3,$OUThi,$OUThi
||	SHL	$Ahix1,24,$Alox1
||	SHRU	$Ahix1,8, $Ahix1
	XOR	$Alox1,$OUTlo,$OUTlo
||	XOR	$Ahix1,$OUThi,$OUThi
___
}
# Assemble bn_GF2m_mul_2x2 and write it out.
#
# The 64x64-bit carry-less product is built from three 32x32-bit
# multiplications: a0*b0, a1*b1 and (a0+a1)*(b0+b1), where "+" is XOR.
# The three multiplications are software-pipelined: each mul_1x1_merged
# call finishes the previous product while starting the next one.
#
#   A28 (low) / B28 (high)  <- a0*b0
#   A31 (low) / B31 (high)  <- a1*b1
#   A30 (low) / B30 (high)  <- (a0+a1)*(b0+b1), then reduced to the
#                              middle term by XORing in the other two
#
# The result is stored as rp[0]=A28, rp[1]=A30, rp[2]=A31, rp[3]=B31.
#
# A heredoc that starts with a "||" line continues the last execute packet
# emitted by the sub call just before it, so the relative order of the
# calls and the heredocs must not change.
$code.=<<___;
	.text

	.if	.ASSEMBLER_VERSION<7000000
	.asg	0,__TI_EABI__
	.endif
	.if	__TI_EABI__
	.asg	bn_GF2m_mul_2x2,_bn_GF2m_mul_2x2
	.endif

	.global	_bn_GF2m_mul_2x2
_bn_GF2m_mul_2x2:
	.asmfunc
	MVK	0xFF,$xFF
___
	mul_1x1_upper($a0,$b0);		# a0·b0
$code.=<<___;
||	MV	$b1,$B
	MV	$a1,$A
___
	mul_1x1_merged("A28","B28",$A,$B);	# a0·b0/a1·b1
$code.=<<___;
||	XOR	$b0,$b1,$B
	XOR	$a0,$a1,$A
___
	mul_1x1_merged("A31","B31",$A,$B);	# a1·b1/(a0+a1)·(b0+b1)
$code.=<<___;
	XOR	A28,A31,A29
||	XOR	B28,B31,B29			; a0·b0+a1·b1
___
	mul_1x1_lower("A30","B30");		# (a0+a1)·(b0+b1)
$code.=<<___;
||	BNOP	B3
	XOR	A29,A30,A30
||	XOR	B29,B30,B30			; (a0+a1)·(b0+b1)-a0·b0-a1·b1
	XOR	B28,A30,A30
||	STW	A28,*${rp}[0]
	XOR	B30,A31,A31
||	STW	A30,*${rp}[1]
	STW	A31,*${rp}[2]
	STW	B31,*${rp}[3]
	.endasmfunc
___

# Buffered write errors only surface at close, so the close is checked.
print $code;
close STDOUT or die "error closing STDOUT: $!";