; AMD64 mpn_submul_1 optimised for Intel Haswell.

; Contributed to the GNU project by Torbjörn Granlund.
; Converted to MPIR by Alexander Kruppa.

; Copyright 2013 Free Software Foundation, Inc.

; This file is part of the GNU MP Library.
;
; The GNU MP Library is free software; you can redistribute it and/or modify
; it under the terms of either:
;
;   * the GNU Lesser General Public License as published by the Free
;     Software Foundation; either version 3 of the License, or (at your
;     option) any later version.
;
; or
;
;   * the GNU General Public License as published by the Free Software
;     Foundation; either version 2 of the License, or (at your option) any
;     later version.
;
; or both in parallel, as here.
;
; The GNU MP Library is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
; for more details.
;
; You should have received copies of the GNU General Public License and the
; GNU Lesser General Public License along with the GNU MP Library. If not,
; see https://www.gnu.org/licenses/.
;-----------------------------------------------------------------------
; mp_limb_t mpn_submul_1(mp_ptr, mp_ptr, mp_size_t, mp_limb_t)
; rax                    rdi     rsi     rdx        rcx    (SysV order)
; rax                    rcx     rdx     r8         r9     (order read here)
;
; Multiplies the Size limbs at S1P by the single limb passed in r9 and
; subtracts the products, in place, from the Size limbs at RP.  rax
; returns the top product limb plus the outstanding borrow.
;
; Entry:  arguments are read from rcx, rdx, r8, r9 (Microsoft x64 order)
;         and copied to rdi (RP), rsi (S1P), rbp (Size) and rdx, the
;         implicit multiplicand of mulx.
; Note:   there is no zero-length guard; Size is assumed to be >= 1.
;
; Layout: the main loop handles four limbs per pass and the tail at .Lend
;         always finishes the last two.  Size mod 4 selects the entry:
;             0 -> .Lb00 -> .Llo0
;             1 -> .Lb01 -> .Ln1 (Size == 1) or .Lgt1 -> .Llo1
;             2 -> .Lb10 -> .Ltop, or straight to .Lend when Size == 2
;             3 -> .Lb11 -> .Llo3
;         mulx, lea and mov leave the flags alone and dec leaves CF
;         alone, so the borrow produced by the last sbb of one pass is
;         picked up by the first adc of the next.
;-----------------------------------------------------------------------

%include 'yasm_mac.inc'

BITS 64

; Registers handed to FRAME_PROC / END_PROC (presumably saved on entry and
; restored on exit by those macros -- confirm in yasm_mac.inc).
%define reg_save_list rbx, rbp, rsi, rdi, r12, r13

%define RP      rdi                     ; destination limb pointer
%define S1P     rsi                     ; source limb pointer
%define Size    rbp                     ; limb count, later loop counter
%define Sizeb   bpl                     ; low byte of Size
%define Limb    rcx                     ; not referenced below; same register as Tmp8

%define Tmp0    r12
%define Tmp1    r13
%define Tmp2    rax                     ; also carries the return value
%define Tmp3    rbx
%define Tmp4    r8
%define Tmp5    r9
%define Tmp6    r10
%define Tmp7    r11
%define Tmp8    rcx

; The operation applied to RP: sub / sbb here.
%define ADDSUB  sub
%define ADCSBB  sbb

        align   16

FRAME_PROC mpn_submul_1, 0, reg_save_list
        mov     RP, rcx
        mov     S1P, rdx
        mov     Size, r8
        mov     rdx, r9                 ; mulx requires one input in rdx

        test    Sizeb, 1                ; odd limb count?
        jnz     .Lbx1

; ---- even Size --------------------------------------------------------
.Lbx0:  shr     Size, 2                 ; CF = bit 1 of Size, ZF = (Size < 4)
        jc      .Lb10                   ;ajs:notshortform

; Size mod 4 == 0
.Lb00:  mulx    Tmp0, Tmp1, [S1P]
        mulx    Tmp2, Tmp3, [S1P+8]
        add     Tmp3, Tmp0
        adc     Tmp2, 0
        mov     Tmp0, [RP]
        mov     Tmp8, [RP+8]
        mulx    Tmp4, Tmp5, [S1P+16]
        lea     RP, [RP-16]             ; bias RP for the .Llo0 store offsets
        lea     S1P, [S1P+16]
        ADDSUB  Tmp0, Tmp1
        jmp     .Llo0                   ;ajs:notshortform

; ---- odd Size ---------------------------------------------------------
.Lbx1:  shr     Size, 2                 ; CF = bit 1 of Size, ZF = (Size < 4)
        jc      .Lb11

; Size mod 4 == 1
.Lb01:  mulx    Tmp6, Tmp7, [S1P]       ; flags from shr are still intact
        jnz     .Lgt1

; Size == 1: one product, applied directly to memory.
.Ln1:   ADDSUB  [RP], Tmp7
        mov     eax, 0                  ; mov, not xor: CF from the sub is needed next
        adc     Tmp2, Tmp6              ; return high limb + borrow
        jmp     .Lret                   ;ajs:notshortform

; Size mod 4 == 1 and Size > 1
.Lgt1:  mulx    Tmp0, Tmp1, [S1P+8]
        mulx    Tmp2, Tmp3, [S1P+16]
        lea     S1P, [S1P+24]
        add     Tmp1, Tmp6
        adc     Tmp3, Tmp0
        adc     Tmp2, 0
        mov     Tmp6, [RP]
        mov     Tmp0, [RP+8]
        mov     Tmp8, [RP+16]
        lea     RP, [RP-8]              ; bias RP for the .Llo1 store offsets
        ADDSUB  Tmp6, Tmp7
        jmp     .Llo1

; Size mod 4 == 3
.Lb11:  mulx    Tmp2, Tmp3, [S1P]
        mov     Tmp8, [RP]
        mulx    Tmp4, Tmp5, [S1P+8]
        lea     S1P, [S1P+8]
        lea     RP, [RP-24]             ; bias RP for the .Llo3 store offset
        inc     Size                    ; one partial pass plus Size full passes remain
        ADDSUB  Tmp8, Tmp3
        jmp     .Llo3

; Size mod 4 == 2
.Lb10:  mulx    Tmp4, Tmp5, [S1P]
        mulx    Tmp6, Tmp7, [S1P+8]
        lea     RP, [RP-32]             ; .Ltop / .Lend address RP+32 onwards
        mov     eax, 0                  ; no pending high limb ...
        clc                             ; ... and no pending carry; ZF from shr kept
        jz      .Lend                   ;ajs:notshortform

; ---- main loop: four limbs per pass -----------------------------------
        align   16
.Ltop:  adc     Tmp5, Tmp2              ; consumes CF left by the previous pass
        lea     RP, [RP+32]
        adc     Tmp7, Tmp4
        mulx    Tmp0, Tmp1, [S1P+16]
        mov     Tmp4, [RP]
        mulx    Tmp2, Tmp3, [S1P+24]
        lea     S1P, [S1P+32]
        adc     Tmp1, Tmp6
        adc     Tmp3, Tmp0
        adc     Tmp2, 0
        mov     Tmp6, [RP+8]
        mov     Tmp0, [RP+16]
        ADDSUB  Tmp4, Tmp5              ; start of the sub/sbb chain on RP
        mov     Tmp8, [RP+24]
        mov     [RP], Tmp4
        ADCSBB  Tmp6, Tmp7
.Llo1:  mulx    Tmp4, Tmp5, [S1P]
        mov     [RP+8], Tmp6
        ADCSBB  Tmp0, Tmp1
.Llo0:  mov     [RP+16], Tmp0
        ADCSBB  Tmp8, Tmp3
.Llo3:  mulx    Tmp6, Tmp7, [S1P+8]
        mov     [RP+24], Tmp8
        dec     Size                    ; dec leaves CF (the borrow) untouched
        jnz     .Ltop

; ---- tail: last two limbs and the return value ------------------------
.Lend:  adc     Tmp5, Tmp2
        adc     Tmp7, Tmp4
        mov     Tmp4, [RP+32]
        mov     Tmp2, Tmp6              ; top product limb becomes the result ...
        adc     Tmp2, 0                 ; ... plus the carry out of the product chain
        mov     Tmp6, [RP+40]
        ADDSUB  Tmp4, Tmp5
        mov     [RP+32], Tmp4
        ADCSBB  Tmp6, Tmp7
        mov     [RP+40], Tmp6
        adc     Tmp2, 0                 ; ... plus the final borrow

.Lret:
        END_PROC reg_save_list