1/* Copyright (C) 2011-2013 Free Software Foundation, Inc. 2 Contributed by Embecosm on behalf of Adapteva, Inc. 3 4This file is part of GCC. 5 6GCC is free software; you can redistribute it and/or modify it under 7the terms of the GNU General Public License as published by the Free 8Software Foundation; either version 3, or (at your option) any later 9version. 10 11GCC is distributed in the hope that it will be useful, but WITHOUT ANY 12WARRANTY; without even the implied warranty of MERCHANTABILITY or 13FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14for more details. 15 16Under Section 7 of GPL version 3, you are granted additional 17permissions described in the GCC Runtime Library Exception, version 183.1, as published by the Free Software Foundation. 19 20You should have received a copy of the GNU General Public License and 21a copy of the GCC Runtime Library Exception along with this program; 22see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 23<http://www.gnu.org/licenses/>. */ 24 25#include "../epiphany-asm.h" 26 27.section _fast_div_text,"a",@progbits; 28 .balign 8; 29_fast_div_table: 30.word 0x007fffff// mantissa mask 31.word 0x40257ebb// hold constant a = 2.58586 32 33.word 0x3f000000// hold constant 126 shifted to bits [30:23] 34.word 0xc0ba2e88// hold constant b = -5.81818 35 36.word 0x4087c1e8// hold constant c = 4.24242 37.word 0x40000000// to hold constant 2 for Newton-Raphson iterations 38 39 .global SYM(__fast_recipsf2) 40 FUNC(__fast_recipsf2) 41SYM(__fast_recipsf2): 42 43//################### 44//# input operands: 45//################### 46// Divisor 47//R0 48// Function address (used with negative offsets to read _fast_div_table) 49//R1 50/* Scratch registers: two single (TMP0/TMP5) and two pairs. */ 51#define P0L TMP1 52#define P0H TMP2 53#define P1L TMP3 54#define P1H TMP4 55 56//######################################### 57//# Constants to be used in the algorithm 58//######################################### 59ldrd P0L , [ R1 , -3 ] 60 61ldrd P1L , [ R1 , -2 ] 62 63 64 65//############################################################################# 66//# The Algorithm 67//# 68//# Operation: C=A/B 69//# stage 1 - find the reciprocal 1/B according to the following scheme: 70//# B = (2^E)*m (1<m<2, E=e-127) 71//# 1/B = 1/((2^E)*m) = 1/((2^(E+1))*m1) (0.5<m1<1) 72//# = (2^-(E+1))*(1/m1) = (2^E1)*(1/m1) 73//# 74//# Now we can find the new exponent: 75//# e1 = E1+127 = -E-1+127 = -e+127-1+127 = 253-e ** 76//# 1/m1 alreadt has the exponent 127, so we have to add 126-e. 77//# the exponent might underflow, which we can detect as a sign change. 78//# Since the architeture uses flush-to-zero for subnormals, we can 79//# give the result 0. then. 80//# 81//# The 1/m1 term with 0.5<m1<1 is approximated with the Chebyshev polynomial 82//# 1/m1 = 2.58586*(m1^2) - 5.81818*m1 + 4.24242 83//# 84//# Next step is to use two iterations of Newton-Raphson algorithm to complete 85//# the reciprocal calculation. 86//# 87//# Final result is achieved by multiplying A with 1/B 88//############################################################################# 89 90 91 92// R0 exponent and sign "replacement" into TMP0 93AND TMP0,R0,P0L ; 94ORR TMP0,TMP0,P1L 95SUB TMP5,R0,TMP0 // R0 sign/exponent extraction into TMP5 96// Calculate new mantissa 97FMADD P1H,TMP0,P0H ; 98 // Calculate new exponent offset 126 - "old exponent" 99 SUB P1L,P1L,TMP5 100 ldrd P0L , [ R1 , -1 ] 101FMADD P0L,TMP0,P1H ; 102 eor P1H,r0,P1L // check for overflow (N-BIT). 103 blt .Lret_0 104// P0L exponent and sign "replacement" 105sub P0L,P0L,TMP5 106 107// Newton-Raphson iteration #1 108MOV TMP0,P0H ; 109FMSUB P0H,R0,P0L ; 110FMUL P0L,P0H,P0L ; 111// Newton-Raphson iteration #2 112FMSUB TMP0,R0,P0L ; 113FMUL R0,TMP0,P0L ; 114jr lr 115.Lret_0:ldrd P0L , [ R1 , -3 ] 116 lsr TMP0,r0,31 ; extract sign 117 lsl TMP0,TMP0,31 118 add P0L,P0L,r0 ; check for NaN input 119 eor P0L,P0L,r0 120 movgte r0,TMP0 121 jr lr 122// Quotient calculation is expected by the caller: FMUL quotient,divident,R0 123 ; 124 ENDFUNC(__fast_recipsf2) 125