dnl  AMD64 mpn_cnd_add_n, mpn_cnd_sub_n

dnl  Copyright 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 2
C AMD K10	 2
C AMD bd1	 2.32
C AMD bobcat	 3
C Intel P4	13
C Intel core2	 2.9
C Intel NHM	 2.8
C Intel SBR	 2.4
C Intel atom	 5.33
C VIA nano	 3

C NOTES
C  * It might seem natural to use the cmov insn here, but since this function
C    is supposed to have the exact same execution pattern for cnd true and
C    false, and since cmov's documentation is not clear about whether it
C    actually reads both source operands and writes the register for a false
C    condition, we cannot use it.
C  * Two cases could be optimised: (1) cnd_add_n could use ADCSBB-from-memory
C    to save one insn/limb, and (2) when up=rp cnd_add_n and cnd_sub_n could use
C    ADCSBB-to-memory, again saving 1 insn/limb.
C  * This runs optimally at decoder bandwidth on K10.  It has not been tuned
C    for any other processor.
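
C  For reference, a plain C sketch of what this file computes.  The name
C  ref_cnd_add_n and the local variable names are illustrative only, not
C  part of GMP's interface; cnd_sub_n is the same except that it subtracts
C  the masked limbs and propagates a borrow instead of a carry.
C
C	mp_limb_t
C	ref_cnd_add_n (mp_limb_t cnd, mp_ptr rp, mp_srcptr up, mp_srcptr vp,
C		       mp_size_t n)
C	{
C	  mp_limb_t mask = -(mp_limb_t) (cnd != 0);	/* 0 or ~0 */
C	  mp_limb_t cy = 0;
C	  for (mp_size_t i = 0; i < n; i++)
C	    {
C	      mp_limb_t v = vp[i] & mask;		/* 0 when cnd is false */
C	      mp_limb_t s = up[i] + v;
C	      mp_limb_t r = s + cy;
C	      cy = (s < v) | (r < s);			/* carry out */
C	      rp[i] = r;
C	    }
C	  return cy;
C	}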

C INPUT PARAMETERS
define(`cnd',	`%rdi')	dnl rcx
define(`rp',	`%rsi')	dnl rdx
define(`up',	`%rdx')	dnl r8
define(`vp',	`%rcx')	dnl r9
define(`n',	`%r8')	dnl rsp+40

ifdef(`OPERATION_cnd_add_n', `
	define(ADDSUB,	      add)
	define(ADCSBB,	      adc)
	define(func,	      mpn_cnd_add_n)')
ifdef(`OPERATION_cnd_sub_n', `
	define(ADDSUB,	      sub)
	define(ADCSBB,	      sbb)
	define(func,	      mpn_cnd_sub_n)')

MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(func)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), R32(%r8)')
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14

	neg	cnd
	sbb	cnd, cnd		C make cnd mask
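C  neg sets CF iff cnd was nonzero; sbb of cnd with itself then leaves
C  0 (cnd false) or ~0 (cnd true) in cnd, a mask that is ANDed into every
C  vp limb so that both cases execute the identical instruction sequence.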

	lea	(vp,n,8), vp
	lea	(up,n,8), up
	lea	(rp,n,8), rp

	mov	R32(n), R32(%rax)
	neg	n
	and	$3, R32(%rax)
	jz	L(top)			C carry-save reg rax = 0 in this arc
	cmp	$2, R32(%rax)
	jc	L(b1)
	jz	L(b2)

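C  Feed-in blocks: the main loop handles four limbs per iteration, so the
C  n mod 4 leading limbs are processed here, leaving the carry saved in eax.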
L(b3):	mov	(vp,n,8), %r12
	mov	8(vp,n,8), %r13
	mov	16(vp,n,8), %r14
	and	cnd, %r12
	mov	(up,n,8), %r10
	and	cnd, %r13
	mov	8(up,n,8), %rbx
	and	cnd, %r14
	mov	16(up,n,8), %rbp
	ADDSUB	%r12, %r10
	mov	%r10, (rp,n,8)
	ADCSBB	%r13, %rbx
	mov	%rbx, 8(rp,n,8)
	ADCSBB	%r14, %rbp
	mov	%rbp, 16(rp,n,8)
	sbb	R32(%rax), R32(%rax)	C save carry
	add	$3, n
	js	L(top)
	jmp	L(end)

L(b2):	mov	(vp,n,8), %r12
	mov	8(vp,n,8), %r13
	mov	(up,n,8), %r10
	and	cnd, %r12
	mov	8(up,n,8), %rbx
	and	cnd, %r13
	ADDSUB	%r12, %r10
	mov	%r10, (rp,n,8)
	ADCSBB	%r13, %rbx
	mov	%rbx, 8(rp,n,8)
	sbb	R32(%rax), R32(%rax)	C save carry
	add	$2, n
	js	L(top)
	jmp	L(end)

L(b1):	mov	(vp,n,8), %r12
	mov	(up,n,8), %r10
	and	cnd, %r12
	ADDSUB	%r12, %r10
	mov	%r10, (rp,n,8)
	sbb	R32(%rax), R32(%rax)	C save carry
	add	$1, n
	jns	L(end)

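C  Main loop.  CF cannot survive the loop-control add below, so the carry
C  is saved as 0/-1 in eax (sbb %eax,%eax) and regenerated at the top of
C  each iteration (add %eax,%eax sets CF iff the saved value was -1).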
	ALIGN(16)
L(top):	mov	(vp,n,8), %r12
	mov	8(vp,n,8), %r13
	mov	16(vp,n,8), %r14
	mov	24(vp,n,8), %r11
	and	cnd, %r12
	mov	(up,n,8), %r10
	and	cnd, %r13
	mov	8(up,n,8), %rbx
	and	cnd, %r14
	mov	16(up,n,8), %rbp
	and	cnd, %r11
	mov	24(up,n,8), %r9
	add	R32(%rax), R32(%rax)	C restore carry
	ADCSBB	%r12, %r10
	mov	%r10, (rp,n,8)
	ADCSBB	%r13, %rbx
	mov	%rbx, 8(rp,n,8)
	ADCSBB	%r14, %rbp
	mov	%rbp, 16(rp,n,8)
	ADCSBB	%r11, %r9
	mov	%r9, 24(rp,n,8)
	sbb	R32(%rax), R32(%rax)	C save carry
	add	$4, n
	js	L(top)

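C  eax holds the saved carry/borrow as 0 or -1; negate it to return 0 or 1.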
L(end):	neg	R32(%rax)
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()