libdivide/libdivide-5.0/constant_fast_div.h

/*
* When dividing by a known compile time constant, the division can be replaced
* by a multiply+shift operation. GCC will do this automatically,
* *BUT ONLY FOR DIVISION OF REGISTER-WIDTH OR NARROWER*.
*
* So on an 8-bit system, 16-bit divides will *NOT* be optimised.
*
* The macros here manually apply the multiply+shift operation for 16-bit numbers.
*
* Testing on an AtMega2560, -O3 optimizations:
*   Performance improvement of 85% to 90%+ speed up (division by non-powers of 2)
*   Zero increase in RAM usage
*   Average of 25 bytes Flash used per call site
*     Be careful calling this in a loop with aggressive loop unrolling!
*
* Note: testing of the multiply+shift technique on 8-bit division showed a
* slight slowdown compared to native code on AtMega2560, so the 8-bit
* equivalent macros have not been included.
*/

#pragma once
#include "libdivide.h"
#include "u16_ldparams.h"
#include "s16_ldparams.h"

// Two-level token pasting. CONCAT's arguments are macro-expanded before being
// handed to CAT_HELPER, which then glues the two results together with ##.
#define CAT_HELPER(lhs, rhs) lhs ## rhs
#define CONCAT(first, second) CAT_HELPER(first, second)

// GCC will optimise division by a power of 2, so these predicates let the
// FAST_DIV* wrappers detect that case and fall back to a plain division.
//
// Each predicate is non-zero when denom equals one of the listed powers of 2:
//   S16_ISPOW2_NEG : -2 .. -16384
//   S16_ISPOW2_POS :  2 ..  16384
//   U16_ISPOW2     :  2 ..  32768
//   S16_ISPOW2     :  either sign, magnitude 2 .. 16384
// Note that 1 and -1 are not in the lists.
//
// Every use of denom is parenthesised so that an expression argument
// (e.g. 2+1) is compared as a whole instead of being split up by operator
// precedence. denom is evaluated many times, so it must be free of side
// effects.
#define S16_ISPOW2_NEG(denom) \
 ((denom)==-2 || \
  (denom)==-4 || \
  (denom)==-8 || \
  (denom)==-16 || \
  (denom)==-32 || \
  (denom)==-64 || \
  (denom)==-128 || \
  (denom)==-256 || \
  (denom)==-512 || \
  (denom)==-1024 || \
  (denom)==-2048 || \
  (denom)==-4096 || \
  (denom)==-8192 || \
  (denom)==-16384)
#define S16_ISPOW2_POS(denom) \
 ((denom)==2 || \
  (denom)==4 || \
  (denom)==8 || \
  (denom)==16 || \
  (denom)==32 || \
  (denom)==64 || \
  (denom)==128 || \
  (denom)==256 || \
  (denom)==512 || \
  (denom)==1024 || \
  (denom)==2048 || \
  (denom)==4096 || \
  (denom)==8192 || \
  (denom)==16384)
#define U16_ISPOW2(denom) (S16_ISPOW2_POS(denom) || (denom)==32768)
#define S16_ISPOW2(denom) (S16_ISPOW2_POS(denom) || S16_ISPOW2_NEG(denom))

// Prefix placed in front of libdivide function calls: empty when compiled as
// C, the libdivide:: namespace qualifier when compiled as C++.
#if !defined(__cplusplus)
#define LIB_DIV_NAMESPACE
#else
#define LIB_DIV_NAMESPACE libdivide::
#endif

/*
* Wrapper for *unsigned* 16-bit DIVISION. The divisor must be a compile time
* constant.
* E.g. FAST_DIV16U(value, 100)
*
* The divisor is token-pasted into the identifiers U16LD_DENOM_<d>_MAGIC and
* U16LD_DENOM_<d>_MORE (presumably supplied by u16_ldparams.h), so it has to
* be a bare integer literal or a macro that expands to one.
* Both arms of the conditional are compiled, so those identifiers must exist
* even when the divisor is a power of 2.
*
* Power-of-2 divisors use an ordinary division; everything else is passed to
* libdivide_u16_do_raw. The operands of the ordinary division are
* parenthesised so that an expression dividend such as x + 1 is divided as a
* whole.
*/
#define U16_MAGIC(d) CONCAT(CONCAT(U16LD_DENOM_, d), _MAGIC)
#define U16_MORE(d) CONCAT(CONCAT(U16LD_DENOM_, d), _MORE)
#define FAST_DIV16U(a, d) (U16_ISPOW2(d) ? (a)/(d) : LIB_DIV_NAMESPACE libdivide_u16_do_raw((a), U16_MAGIC(d), U16_MORE(d)))

/*
* Wrapper for *signed* 16-bit DIVISION by a *POSITIVE* compile time constant.
* E.g. FAST_DIV16(-value, 777)
*
* This only works for positive divisors :-(
* A negative number results in a hyphen in the macro name, which is not
* allowed: the divisor is token-pasted into the identifiers
* S16LD_DENOM_<d>_MAGIC and S16LD_DENOM_<d>_MORE (presumably supplied by
* s16_ldparams.h), so it has to be a bare integer literal or a macro that
* expands to one.
* Both arms of the conditional are compiled, so those identifiers must exist
* even when the divisor is a power of 2.
*
* Power-of-2 divisors use an ordinary division; everything else is passed to
* libdivide_s16_do_raw. The operands of the ordinary division are
* parenthesised so that an expression dividend such as x + 1 is divided as a
* whole.
*/
#define S16_MAGIC(d) CONCAT(CONCAT(S16LD_DENOM_, d), _MAGIC)
#define S16_MORE(d) CONCAT(CONCAT(S16LD_DENOM_, d), _MORE)
#define FAST_DIV16(a, d) (S16_ISPOW2(d) ? (a)/(d) : LIB_DIV_NAMESPACE libdivide_s16_do_raw((a), S16_MAGIC(d), S16_MORE(d)))

/*
* Wrapper for *signed* 16-bit DIVISION by a *NEGATIVE* compile time constant.
* E.g. FAST_DIV16_NEG(-value, 777) // <-- It's converted to negative. Really.
*
* Pass the *magnitude* of the divisor as a positive number :-(
* A negative number results in a hyphen in the macro name, which is not
* allowed: the divisor is token-pasted into the identifiers
* S16LD_DENOM_MINUS_<d>_MAGIC and S16LD_DENOM_MINUS_<d>_MORE (presumably
* supplied by s16_ldparams.h), so it has to be a bare integer literal or a
* macro that expands to one.
* Both arms of the conditional are compiled, so those identifiers must exist
* even when the divisor is a power of 2.
*
* Power-of-2 magnitudes use an ordinary division by -(d); everything else is
* passed to libdivide_s16_do_raw. The operands of the ordinary division are
* parenthesised so that an expression dividend such as x + 1 is divided as a
* whole.
*/
#define S16_MAGIC_NEG(d) CONCAT(CONCAT(S16LD_DENOM_MINUS_, d), _MAGIC)
#define S16_MORE_NEG(d) CONCAT(CONCAT(S16LD_DENOM_MINUS_, d), _MORE)
#define FAST_DIV16_NEG(a, d) (S16_ISPOW2(d) ? (a)/-(d) : LIB_DIV_NAMESPACE libdivide_s16_do_raw((a), S16_MAGIC_NEG(d), S16_MORE_NEG(d)))

/*
* Wrapper for *unsigned* 16-bit MODULUS. The divisor must be a compile time
* constant.
* E.g. FAST_MOD16U(value, 6)
*
* Computed as a - (a / d) * d, with the quotient obtained from FAST_DIV16U.
* Both operands are parenthesised so that an expression dividend such as
* x | 8 is used as a whole. The dividend appears twice in the expansion, so
* it must be free of side effects.
*/
#define FAST_MOD16U(a, d) ((a) - (FAST_DIV16U(a, d) * (d)))