# mach: bfin

// FIR FILTER COMPUTED DIRECTLY ON INPUT WITH NO
// INTERNAL STATE
// TWO OUTPUTS PER ITERATION
// This program computes a FIR filter without maintaining a buffer of internal
// state.
// This example computes two output samples per inner loop.  The following
// diagram shows the alignment required for signal x and coefficients c:
//   x0 x1 x2 x3 x4 x5
//   c0 c1 c2 c3 c4       -> output z(0)=x0*c0 + x1*c1 + ...
//      c0 c1 c2 c3 c4    ->        z(1)=x1*c0 + x2*c1 + ...
//        L-1
//        ---
// Z(k) = \   c(n) * x(n+k)
//        /
//        ---
//        n=0
// Naive, first stab at splitting this for dual MACS.
//        L/2-1                       L/2-1
//        ---                         ---
// R(k) = \   (x(2n) * y(2n+k))   +   \   (x(2n+1) * y(2n+1+k))
//        /                           /
//        ---                         ---
//        n=0                         n=0
// Alternate, better partitioning for the machine.
//        L-1
//        ---
// R(0) = \   x(n) * y(n)
//        /
//        ---
//        n=0
//        L-1
//        ---
// R(1) = \   x(n) * y(n+1)
//        /
//        ---
//        n=0
//        L-1
//        ---
// R(2) = \   x(n) * y(n+2)
//        /
//        ---
//        n=0
//        L-1
//        ---
// R(3) = \   x(n) * y(n+3)
//        /
//        ---
//        n=0
//        .
//        .
//        .
//        .
// Okay, in this version the inner loop will compute R(2k) and R(2k+1) in
// parallel:
//         L-1
//         ---
// R(2k) = \   x(n) * y(n+2k)
//         /
//         ---
//         n=0
//           L-1
//           ---
// R(2k+1) = \   x(n) * y(n+2k+1)
//           /
//           ---
//           n=0
// Implementation
// --------------
// Sample pair x1 x0 is loaded into register R0, and coefficient pair c1 c0
// is loaded into register R1:
//  +-------+  R0
//  | x1 x0 |
//  +-------+
//  +-------+  R1
//  | c1 c0 |  compute two MACs:  z(0)+=x0*c0, and z(1)+=x1*c0
//  +-------+
// Now load x2 into lo half of R0, and compute the next two MACs:
//  +-------+  R0
//  | x1 x2 |
//  +-------+
//  +-------+  R1
//  | c1 c0 |  compute z(0)+=x1*c1 and z(1)+=x2*c1 (c0 not used)
//  +-------+
// Meanwhile, load coefficient pair c3 c2 and x3 into hi half of R0.
// (The code below reloads R1 for every coefficient pair; R2 is only
// cleared at start-up and never used for coefficients.)
//  +-------+  R0
//  | x3 x2 |
//  +-------+
//  +-------+  R1
//  | c3 c2 |  compute z(0)+=x2*c2 and z(1)+=x3*c2 (c3 not used)
//  +-------+
// Load x4 into low half of R0:
//  +-------+  R0
//  | x3 x4 |
//  +-------+
//  +-------+  R1
//  | c3 c2 |  compute z(0)+=x3*c3 and z(1)+=x4*c3 (c2 not used)
//  +-------+
//
// This is a reference FIR function used to test:
//void firf (float input[], float output[], float coeffs[],
//           long input_size, long coeffs_size)
//{
//    long i, k;
//    for(i=0; i< input_size; i++){
//        output[i] = 0;
//        for(k=0; k < coeffs_size; k++)
//            output[i] += input[k+i] * coeffs[k];
//    }
//}

.include "testutils.inc"
	start


	R0 = 0; R1 = 0; R2 = 0;
	// Loop bounds: P1 = M (number of output samples) and P2 = L (number of
	// filter taps).  Each is halved by the ">> 1" in its LSETUP below.
	P1 = 128 (X);
	P2 = 64 (X);

	// P0 holds pointer to input data in one memory
	// bank.  It advances by two 16-bit samples (4 bytes) at the end of
	// each outer-loop pass.
	loadsym P0, input;

	// Pointer to coeffs in alternate memory bank.
	// (Reloaded at the top of every outer-loop pass.)
	loadsym I1, coef;

	// Pointer to outputs in any memory bank.
	loadsym I2, output;

	// Setup outer do-loop for M/2 iterations
	// (2 outputs are computed per pass)

	LSETUP ( L$0 , L$0end ) LC0 = P1 >> 1;

L$0:
	// Restart the coefficient walk and point I0 at the first input
	// sample of this output pair.
	loadsym I1, coef;
	I0 = P0;
	// Set-up inner do-loop for L/2 iterations
	// (each pass is two dual-MAC instructions: 2 taps for each of the
	// 2 outputs)
	//
	// NOTE(review): each outer pass issues 2 + 2*(L/2) = 66 halfword loads
	// through I0 and 1 + L/2 = 33 word loads through I1, so the final load
	// of each stream is never consumed, and later outer passes read beyond
	// the 128 samples / 64 coefficients declared below.  Only the first
	// five outputs are checked -- confirm the over-read is intended.

	LSETUP ( L$1 , L$1end ) LC1 = P2 >> 1;

	// Load first two data elements in r0,
	// and two coeffs into r1; clear both accumulators.

	R0.L = W [ I0 ++ ];
	A1 = A0 = 0 || R0.H = W [ I0 ++ ] || R1 = [ I1 ++ ];

L$1:
	A1 += R0.H * R1.L, A0 += R0.L * R1.L || R0.L = W [ I0 ++ ] || NOP;
L$1end:
	A1 += R0.L * R1.H, A0 += R0.H * R1.H || R0.H = W [ I0 ++ ] || R1 = [ I1 ++ ];

	// Line 1: do 2 MACs and load next data element into RL0.
	// Line 2: do 2 MACs, load next data element into RH0,
	// and load next 2 coeffs

	// Pack the two results: odd output (A1) in the high half,
	// even output (A0) in the low half.
	R0.H = A1, R0.L = A0;

	// advance data pointer by 2 16b elements
	P0 += 4;

L$0end:
	[ I2 ++ ] = R0;   // store 2 outputs

	// Check results: the input is a single 0x4000 sample at index 4, so
	// output k (k = 0..4) is that sample times coefficient c(4-k).
	loadsym I2, output;

	R0.L = W [ I2 ++ ];	DBGA ( R0.L , 0x0800 );
	R0.L = W [ I2 ++ ];	DBGA ( R0.L , 0x1000 );
	R0.L = W [ I2 ++ ];	DBGA ( R0.L , 0x2000 );
	R0.L = W [ I2 ++ ];	DBGA ( R0.L , 0x1000 );
	R0.L = W [ I2 ++ ];	DBGA ( R0.L , 0x0800 );
	pass

	.data
	// Input signal: 128 16-bit samples, a single impulse at index 4.
input:
	.dw 0x0000
	.dw 0x0000
	.dw 0x0000
	.dw 0x0000
	.dw 0x4000
	.dw 0x0000
	.dw 0x0000
	.dw 0x0000
	.dw 0x0000
	.dw 0x0000
	.space ((128-10)*2);   // must pad with zeros or uninitialized values.

	.data
	// Filter coefficients: 64 16-bit taps, only the first five non-zero.
coef:
	.dw 0x1000
	.dw 0x2000
	.dw 0x4000
	.dw 0x2000
	.dw 0x1000
	.dw 0x0000
	.space ((64-6)*2);    // must pad with zeros or uninitialized values.

	.data
	// Output buffer; the outer loop stores one 32-bit word (two 16-bit
	// results) per pass.
output:
	.space (128*4)