/* Copyright (C) 2000-2020 Free Software Foundation, Inc.
   Contributed by James E. Wilson <wilson@cygnus.com>.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

// Target: IA-64 (Itanium), GNU as syntax, run through the C preprocessor.
// Each routine is wrapped in its own "#ifdef L_<name>" so that a single
// routine is selected per compilation of this file.
//
// Common pattern in the floating-point divide routines below:
//   p7 is first set to 1, then cleared when frcpa sets p6, so p6 and p7
//   are complementary.  When p6 is set, the Newton-Raphson refinement is
//   executed; when p7 is set, the value frcpa left in f10 is returned
//   unchanged.

#ifdef L__divxf3
// Compute an 80-bit IEEE double-extended quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.
//
// __divtf3 is an alternate symbol name for backward compatibility.

	.text
	.align 16
	.global __divxf3
	.proc __divxf3
__divxf3:
#ifdef SHARED
	.global __divtf3
__divtf3:
#endif
	// p7 = 1; cleared below when frcpa asks for the refinement (p6).
	cmp.eq p7, p0 = r0, r0
	frcpa.s0 f10, p6 = farg0, farg1
	;;
(p6)	cmp.ne p7, p0 = r0, r0
	.pred.rel.mutex p6, p7
(p6)	fnma.s1 f11 = farg1, f10, f1
(p6)	fma.s1 f12 = farg0, f10, f0
	;;
(p6)	fma.s1 f13 = f11, f11, f0
(p6)	fma.s1 f14 = f11, f11, f11
	;;
(p6)	fma.s1 f11 = f13, f13, f11
(p6)	fma.s1 f13 = f14, f10, f10
	;;
(p6)	fma.s1 f10 = f13, f11, f10
(p6)	fnma.s1 f11 = farg1, f12, farg0
	;;
(p6)	fma.s1 f11 = f11, f10, f12
(p6)	fnma.s1 f12 = farg1, f10, f1
	;;
(p6)	fma.s1 f10 = f12, f10, f10
(p6)	fnma.s1 f12 = farg1, f11, farg0
	;;
	// Final step uses status field s0 so user-visible flags/rounding apply.
(p6)	fma.s0 fret0 = f12, f10, f11
(p7)	mov fret0 = f10
	br.ret.sptk rp
	.endp __divxf3
#endif

#ifdef L__divdf3
// Compute a 64-bit IEEE double quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.

	.text
	.align 16
	.global __divdf3
	.proc __divdf3
__divdf3:
	// p7 = 1; cleared below when frcpa asks for the refinement (p6).
	cmp.eq p7, p0 = r0, r0
	frcpa.s0 f10, p6 = farg0, farg1
	;;
(p6)	cmp.ne p7, p0 = r0, r0
	.pred.rel.mutex p6, p7
(p6)	fmpy.s1 f11 = farg0, f10
(p6)	fnma.s1 f12 = farg1, f10, f1
	;;
(p6)	fma.s1 f11 = f12, f11, f11
(p6)	fmpy.s1 f13 = f12, f12
	;;
(p6)	fma.s1 f10 = f12, f10, f10
(p6)	fma.s1 f11 = f13, f11, f11
	;;
(p6)	fmpy.s1 f12 = f13, f13
(p6)	fma.s1 f10 = f13, f10, f10
	;;
(p6)	fma.d.s1 f11 = f12, f11, f11
(p6)	fma.s1 f10 = f12, f10, f10
	;;
	// Remainder; note that the destination f8 is also the register
	// read here as farg0 (the dividend is not needed afterwards).
(p6)	fnma.d.s1 f8 = farg1, f11, farg0
	;;
(p6)	fma.d fret0 = f8, f10, f11
(p7)	mov fret0 = f10
	br.ret.sptk rp
	;;
	.endp __divdf3
#endif

#ifdef L__divsf3
// Compute a 32-bit IEEE float quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.

	.text
	.align 16
	.global __divsf3
	.proc __divsf3
__divsf3:
	// p7 = 1; cleared below when frcpa asks for the refinement (p6).
	cmp.eq p7, p0 = r0, r0
	frcpa.s0 f10, p6 = farg0, farg1
	;;
(p6)	cmp.ne p7, p0 = r0, r0
	.pred.rel.mutex p6, p7
(p6)	fmpy.s1 f8 = farg0, f10
(p6)	fnma.s1 f9 = farg1, f10, f1
	;;
(p6)	fma.s1 f8 = f9, f8, f8
(p6)	fmpy.s1 f9 = f9, f9
	;;
(p6)	fma.s1 f8 = f9, f8, f8
(p6)	fmpy.s1 f9 = f9, f9
	;;
(p6)	fma.d.s1 f10 = f9, f8, f8
	;;
	// Round the refined quotient to single precision for the result.
(p6)	fnorm.s.s0 fret0 = f10
(p7)	mov fret0 = f10
	br.ret.sptk rp
	;;
	.endp __divsf3
#endif

#ifdef L__divdi3
// Compute a 64-bit integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
// A zero divisor executes "break 1".

	.text
	.align 16
	.global __divdi3
	.proc __divdi3
__divdi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Convert the inputs to FP, so that they won't be treated as unsigned.
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
(p7)	break 1
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations.
(p6)	fnma.s1 f11 = f9, f10, f1
(p6)	fmpy.s1 f12 = f8, f10
	;;
(p6)	fmpy.s1 f13 = f11, f11
(p6)	fma.s1 f12 = f11, f12, f12
	;;
(p6)	fma.s1 f10 = f11, f10, f10
(p6)	fma.s1 f11 = f13, f12, f12
	;;
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	// Round quotient to an integer.
	fcvt.fx.trunc.s1 f10 = f10
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __divdi3
#endif

#ifdef L__moddi3
// Compute a 64-bit integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a).  in1 holds the divisor (b).
// A zero divisor executes "break 1".

	.text
	.align 16
	.global __moddi3
	.proc __moddi3
__moddi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.  f14 keeps the integer dividend
	// for the final remainder computation.
	setf.sig f14 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Convert the inputs to FP, so that they won't be treated as unsigned.
	fcvt.xf f8 = f14
	fcvt.xf f9 = f9
(p7)	break 1
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations.
(p6)	fmpy.s1 f12 = f8, f10
(p6)	fnma.s1 f11 = f9, f10, f1
	;;
(p6)	fma.s1 f12 = f11, f12, f12
(p6)	fmpy.s1 f13 = f11, f11
	;;
(p6)	fma.s1 f10 = f11, f10, f10
(p6)	fma.s1 f11 = f13, f12, f12
	;;
	// in1 = -b, needed as an integer for the xma below.
	sub in1 = r0, in1
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
	setf.sig f9 = in1
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	// Round quotient to an integer.
	fcvt.fx.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f14
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __moddi3
#endif

#ifdef L__udivdi3
// Compute a 64-bit unsigned integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
// A zero divisor executes "break 1".

	.text
	.align 16
	.global __udivdi3
	.proc __udivdi3
__udivdi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Convert the inputs to FP, to avoid FP software-assist faults.
	fcvt.xuf.s1 f8 = f8
	fcvt.xuf.s1 f9 = f9
(p7)	break 1
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations.
(p6)	fnma.s1 f11 = f9, f10, f1
(p6)	fmpy.s1 f12 = f8, f10
	;;
(p6)	fmpy.s1 f13 = f11, f11
(p6)	fma.s1 f12 = f11, f12, f12
	;;
(p6)	fma.s1 f10 = f11, f10, f10
(p6)	fma.s1 f11 = f13, f12, f12
	;;
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	// Round quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __udivdi3
#endif

#ifdef L__umoddi3
// Compute a 64-bit unsigned integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a).  in1 holds the divisor (b).
// A zero divisor executes "break 1".

	.text
	.align 16
	.global __umoddi3
	.proc __umoddi3
__umoddi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.  f14 keeps the integer dividend
	// for the final remainder computation.
	setf.sig f14 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Convert the inputs to FP, to avoid FP software-assist faults.
	fcvt.xuf.s1 f8 = f14
	fcvt.xuf.s1 f9 = f9
(p7)	break 1
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations.
(p6)	fmpy.s1 f12 = f8, f10
(p6)	fnma.s1 f11 = f9, f10, f1
	;;
(p6)	fma.s1 f12 = f11, f12, f12
(p6)	fmpy.s1 f13 = f11, f11
	;;
(p6)	fma.s1 f10 = f11, f10, f10
(p6)	fma.s1 f11 = f13, f12, f12
	;;
	// in1 = -b, needed as an integer for the xma below.
	sub in1 = r0, in1
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
	setf.sig f9 = in1
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	// Round quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f14
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __umoddi3
#endif

#ifdef L__divsi3
// Compute a 32-bit integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
// A zero divisor executes "break 1".
//
// r2/f11 hold the constant with biased exponent 0x0ffdd, i.e. 2^-34
// (register-format exponent bias is 0xffff), which is added into the
// squared error term of the last refinement step.

	.text
	.align 16
	.global __divsi3
	.proc __divsi3
__divsi3:
	.regstk 2,0,0,0
	mov r2 = 0x0ffdd
	sxt4 in0 = in0
	sxt4 in1 = in1
	;;
	setf.sig f8 = in0
	setf.sig f9 = in1
	// Check divide by zero.  This must follow the sxt4 above: only the
	// low 32 bits of in1 are significant, so testing the raw 64-bit
	// register could miss a zero divisor whose upper bits are non-zero.
	// The ordering now matches __modsi3, __udivsi3 and __umodsi3.
	cmp.ne.unc p0,p7=0,in1
	;;
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
(p7)	break 1
	;;
	setf.exp f11 = r2
	frcpa.s1 f10, p6 = f8, f9
	;;
(p6)	fmpy.s1 f8 = f8, f10
(p6)	fnma.s1 f9 = f9, f10, f1
	;;
(p6)	fma.s1 f8 = f9, f8, f8
(p6)	fma.s1 f9 = f9, f9, f11
	;;
(p6)	fma.s1 f10 = f9, f8, f8
	;;
	// Round quotient to an integer.
	fcvt.fx.trunc.s1 f10 = f10
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __divsi3
#endif

#ifdef L__modsi3
// Compute a 32-bit integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
// A zero divisor executes "break 1".

	.text
	.align 16
	.global __modsi3
	.proc __modsi3
__modsi3:
	.regstk 2,0,0,0
	mov r2 = 0x0ffdd
	sxt4 in0 = in0
	sxt4 in1 = in1
	;;
	// f13 keeps the integer dividend for the final remainder computation.
	setf.sig f13 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// in1 = -b, needed as an integer for the xma below.
	sub in1 = r0, in1
	fcvt.xf f8 = f13
	fcvt.xf f9 = f9
	;;
	setf.exp f11 = r2
	frcpa.s1 f10, p6 = f8, f9
(p7)	break 1
	;;
(p6)	fmpy.s1 f12 = f8, f10
(p6)	fnma.s1 f10 = f9, f10, f1
	;;
	setf.sig f9 = in1
(p6)	fma.s1 f12 = f10, f12, f12
(p6)	fma.s1 f10 = f10, f10, f11
	;;
(p6)	fma.s1 f10 = f10, f12, f12
	;;
	// Round quotient to an integer.
	fcvt.fx.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f13
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __modsi3
#endif

#ifdef L__udivsi3
// Compute a 32-bit unsigned integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
// A zero divisor executes "break 1".

	.text
	.align 16
	.global __udivsi3
	.proc __udivsi3
__udivsi3:
	.regstk 2,0,0,0
	mov r2 = 0x0ffdd
	zxt4 in0 = in0
	zxt4 in1 = in1
	;;
	setf.sig f8 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// After zxt4 the values are non-negative as 64-bit integers, so the
	// signed conversion is used here.
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
(p7)	break 1
	;;
	setf.exp f11 = r2
	frcpa.s1 f10, p6 = f8, f9
	;;
(p6)	fmpy.s1 f8 = f8, f10
(p6)	fnma.s1 f9 = f9, f10, f1
	;;
(p6)	fma.s1 f8 = f9, f8, f8
(p6)	fma.s1 f9 = f9, f9, f11
	;;
(p6)	fma.s1 f10 = f9, f8, f8
	;;
	// Round quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __udivsi3
#endif

#ifdef L__umodsi3
// Compute a 32-bit unsigned integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
// A zero divisor executes "break 1".

	.text
	.align 16
	.global __umodsi3
	.proc __umodsi3
__umodsi3:
	.regstk 2,0,0,0
	mov r2 = 0x0ffdd
	zxt4 in0 = in0
	zxt4 in1 = in1
	;;
	// f13 keeps the integer dividend for the final remainder computation.
	setf.sig f13 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// in1 = -b, needed as an integer for the xma below.
	sub in1 = r0, in1
	fcvt.xf f8 = f13
	fcvt.xf f9 = f9
	;;
	setf.exp f11 = r2
	frcpa.s1 f10, p6 = f8, f9
(p7)	break 1
	;;
(p6)	fmpy.s1 f12 = f8, f10
(p6)	fnma.s1 f10 = f9, f10, f1
	;;
	setf.sig f9 = in1
(p6)	fma.s1 f12 = f10, f12, f12
(p6)	fma.s1 f10 = f10, f10, f11
	;;
(p6)	fma.s1 f10 = f10, f12, f12
	;;
	// Round quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f13
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __umodsi3
#endif

#ifdef L__save_stack_nonlocal
// Notes on save/restore stack nonlocal: We read ar.bsp but write
// ar.bspstore.  This is because ar.bsp can be read at all times
// (independent of the RSE mode) but since it's read-only we need to
// restore the value via ar.bspstore.  This is OK because
// ar.bsp==ar.bspstore after executing "flushrs".

// void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer)
//
// Save-area layout written here (byte offsets from save_area):
//	 0: stack_pointer (in1)
//	 8: ar.bsp
//	16: ar.rnat
//	24: ar.pfs as returned by alloc

	.text
	.align 16
	.global __ia64_save_stack_nonlocal
	.proc __ia64_save_stack_nonlocal
__ia64_save_stack_nonlocal:
	{ .mmf
	  alloc r18 = ar.pfs, 2, 0, 0, 0
	  mov r19 = ar.rsc
	  ;;
	}
	{ .mmi
	  flushrs
	  st8 [in0] = in1, 24		// save_area[0] = sp; in0 -> offset 24
	  and r19 = 0x1c, r19		// keep RSC bits 2..4, clear the mode bits
	  ;;
	}
	{ .mmi
	  st8 [in0] = r18, -16		// save_area[24] = pfs; in0 -> offset 8
	  mov ar.rsc = r19
	  or r19 = 0x3, r19		// value written back to ar.rsc on exit
	  ;;
	}
	{ .mmi
	  mov r16 = ar.bsp
	  mov r17 = ar.rnat
	  adds r2 = 8, in0		// r2 -> offset 16
	  ;;
	}
	{ .mmi
	  st8 [in0] = r16		// save_area[8] = bsp
	  st8 [r2] = r17		// save_area[16] = rnat
	}
	{ .mib
	  mov ar.rsc = r19
	  br.ret.sptk.few rp
	  ;;
	}
	.endp __ia64_save_stack_nonlocal
#endif

#ifdef L__nonlocal_goto
// void __ia64_nonlocal_goto(void *target_label, void *save_area,
//			     void *static_chain);
//
// Reloads sp (r12), ar.bspstore, ar.rnat and ar.pfs from save_area
// (offsets 0, 8, 16 and 24), copies static_chain to r15, and "returns"
// to target_label by placing it in rp.

	.text
	.align 16
	.global __ia64_nonlocal_goto
	.proc __ia64_nonlocal_goto
__ia64_nonlocal_goto:
	{ .mmi
	  alloc r20 = ar.pfs, 3, 0, 0, 0
	  ld8 r12 = [in1], 8
	  mov.ret.sptk rp = in0, .L0
	  ;;
	}
	{ .mmf
	  ld8 r16 = [in1], 8
	  mov r19 = ar.rsc
	  ;;
	}
	{ .mmi
	  flushrs
	  ld8 r17 = [in1], 8
	  and r19 = 0x1c, r19		// keep RSC bits 2..4, clear the mode bits
	  ;;
	}
	{ .mmi
	  ld8 r18 = [in1]
	  mov ar.rsc = r19
	  or r19 = 0x3, r19		// value written back to ar.rsc on exit
	  ;;
	}
	{ .mmi
	  mov ar.bspstore = r16
	  ;;
	  mov ar.rnat = r17
	  ;;
	}
	{ .mmi
	  loadrs
	  invala
	  mov r15 = in2
	  ;;
	}
.L0:	{ .mib
	  mov ar.rsc = r19
	  mov ar.pfs = r18
	  br.ret.sptk.few rp
	  ;;
	}
	.endp __ia64_nonlocal_goto
#endif

#ifdef L__restore_stack_nonlocal
// This is mostly the same as nonlocal_goto above.
// ??? This has not been tested yet.

// void __ia64_restore_stack_nonlocal(void *save_area)
//
// Reloads sp (r12), ar.bspstore, ar.rnat and ar.pfs from save_area
// (offsets 0, 8, 16 and 24) and returns through the caller's rp.
// NOTE(review): alloc declares 4 input registers although the prototype
// has one argument, and r20 is never read -- left as is; confirm intent.

	.text
	.align 16
	.global __ia64_restore_stack_nonlocal
	.proc __ia64_restore_stack_nonlocal
__ia64_restore_stack_nonlocal:
	{ .mmf
	  alloc r20 = ar.pfs, 4, 0, 0, 0
	  ld8 r12 = [in0], 8
	  ;;
	}
	{ .mmb
	  ld8 r16=[in0], 8
	  mov r19 = ar.rsc
	  ;;
	}
	{ .mmi
	  flushrs
	  ld8 r17 = [in0], 8
	  and r19 = 0x1c, r19		// keep RSC bits 2..4, clear the mode bits
	  ;;
	}
	{ .mmf
	  ld8 r18 = [in0]
	  mov ar.rsc = r19
	  ;;
	}
	{ .mmi
	  mov ar.bspstore = r16
	  ;;
	  mov ar.rnat = r17
	  or r19 = 0x3, r19		// value written back to ar.rsc on exit
	  ;;
	}
	{ .mmf
	  loadrs
	  invala
	  ;;
	}
.L0:	{ .mib
	  mov ar.rsc = r19
	  mov ar.pfs = r18
	  br.ret.sptk.few rp
	  ;;
	}
	.endp __ia64_restore_stack_nonlocal
#endif

#ifdef L__trampoline
// Implement the nested function trampoline.  This is out of line
// so that we don't have to bother with flushing the icache, as
// well as making the on-stack trampoline smaller.
//
// The trampoline has the following form:
//
//		+-------------------+ >
//	TRAMP:	| __ia64_trampoline | |
//		+-------------------+  > fake function descriptor
//		| TRAMP+16          | |
//		+-------------------+ >
//		| target descriptor |
//		+-------------------+
//		| static link	    |
//		+-------------------+
//
// On entry r1 holds the second word of the fake descriptor, i.e. TRAMP+16.

	.text
	.align 16
	.global __ia64_trampoline
	.proc __ia64_trampoline
__ia64_trampoline:
	{ .mmi
	  ld8 r2 = [r1], 8		// r2 = address of the target descriptor
	  ;;
	  ld8 r15 = [r1]		// r15 = static link
	}
	{ .mmi
	  ld8 r3 = [r2], 8		// r3 = first word of target descriptor
	  ;;
	  ld8 r1 = [r2]			// r1 = second word of target descriptor
	  mov b6 = r3
	}
	{ .bbb
	  br.sptk.many b6
	  ;;
	}
	.endp __ia64_trampoline
#endif

#ifdef SHARED
// Thunks for backward compatibility.
// Each thunk is a single branch from the old symbol name to the
// current one.
#ifdef L_fixtfdi
	.text
	.align 16
	.global __fixtfti
	.proc __fixtfti
__fixtfti:
	{ .bbb
	  br.sptk.many __fixxfti
	  ;;
	}
	.endp __fixtfti
#endif
#ifdef L_fixunstfdi
	.text
	.align 16
	.global __fixunstfti
	.proc __fixunstfti
__fixunstfti:
	{ .bbb
	  br.sptk.many __fixunsxfti
	  ;;
	}
	.endp __fixunstfti
#endif
#ifdef L_floatditf
	.text
	.align 16
	.global __floattitf
	.proc __floattitf
__floattitf:
	{ .bbb
	  br.sptk.many __floattixf
	  ;;
	}
	.endp __floattitf
#endif
#endif