1# mach: bfin
2
3// FIR FILTER COMPUTED DIRECTLY ON INPUT WITH NO
4//   INTERNAL STATE
5//   TWO OUTPUTS PER ITERATION
6// This program computes a FIR filter without maintaining a buffer of internal
7// state.
8// This example computes two output samples per inner loop. The following
9// diagram shows the alignment required for signal x and coefficients c:
10// x0 x1 x2 x3 x4 x5
11// c0 c1 c2 c3 c4      -> output z(0)=x0*c0 + x1*c1 + ...
12//    c0 c1 c2 c3 c4   ->        z(1)=x1*c0 + x2*c1 + ...
13//	       L-1
14//               ---
15//      Z(k) =   \   c(n) * x(n+k)
16//               /
17//	         ---
18//       	       n=0
19// Naive, first stab at splitting this for dual MACS.
20//	       L/2-1                     L/2-1
21//               --- 		           ---
22//      R(k) =   \   (x(2n) * y(2n+k))  +  \   (x(2n+1) * y(2n+1+k))
23//               /  		           /
24//	         --- 		           ---
25//       	       n=0		         n=0
26// Alternate, better partitioning for the machine.
27//	       L-1
28//               ---
29//      R(0) =   \   x(n) * y(n)
30//               /
31//	         ---
32//       	 n=0
33//	       L-1
34//               ---
35//      R(1) =   \   x(n) * y(n+1)
36//               /
37//	         ---
38//              n=0
39//	       L-1
40//               ---
41//      R(2) =   \   x(n) * y(n+2)
42//               /
43//	         ---
44//              n=0
45//	       L-1
46//               ---
47//      R(3) =   \   x(n) * y(n+3)
48//               /
49//	         ---
50//               n=0
51//		.
52//		.
53//		.
54//		.
55// Okay in this version the inner loop will compute R(2k) and R(2k+1) in parallel
56//	       L-1
57//               ---
58//     R(2k) =   \   x(n) * y(n+2k)
59//               /
60//	         ---
61//              n=0
62//	       L-1
63//               ---
64//   R(2k+1) =   \   x(n) * y(n+2k+1)
65//               /
66//	         ---
67//              n=0
68// Implementation
69// --------------
70// Sample pair x1 x0 is loaded into register R0, and coefficients c1 c0
71// is loaded into register R1:
72// +-------+ R0
73// | x1 x0 |
74// +-------+
75// +-------+ R1
76// | c1 c0 |  compute two MACs: z(0)+=x0*c0, and z(1)+=x1*c0
77// +-------+
78// Now load x2 into lo half of R0, and compute the next two MACs:
79// +-------+ R0
80// | x1 x2 |
81// +-------+
82// +-------+ R1
83// | c1 c0 |    compute z(0)+=x1*c1 and z(1)+=x2*c1 (c0 not used)
84// +-------+
85// Meanwhile, load coefficient pair c3 c2 into R1, and x3 into hi half of R0:
86// +-------+ R0
87// | x3 x2 |
88// +-------+
89// +-------+ R1
90// | c3 c2 |    compute z(0)+=x2*c2 and z(1)+=x3*c2 (c3 not used)
91// +-------+
92// Load x4 into low half of R0:
93// +-------+ R0
94// | x3 x4 |
95// +-------+
96// +-------+ R1
97// | c3 c2 |    compute z(0)+=x3*c3 and z(1)+=x4*c3 (c2 not used)
98// +-------+
99// This is a reference FIR function used to test:
100//void firf (float input[], float  output[], float coeffs[],
101//           long input_size, long coeffs_size)
102//{
103//  long i, k;
104//  for(i=0;	i< input_size; i++){
105//    output[i] = 0;
106//    for(k=0;	k < coeffs_size; k++)
107//	output[i] += input[k+i] * coeffs[k];
108// }
109//}
110
111.include "testutils.inc"
112	start
113	// Test program: runs a 64-tap FIR directly over `input`, producing two 16-bit
114	// outputs per outer-loop pass, then checks the first five outputs with DBGA.
115	R0 = 0;	R1 = 0; R2 = 0;	// clear work registers (R2 is not referenced again below)
116	P1 = 128 (X);	// P1 = output count; the outer LSETUP uses P1 >> 1 as its loop count
117	P2 = 64 (X);	// P2 = coefficient count; the inner LSETUP uses P2 >> 1 as its loop count
118
119	// P0 = address of the first input sample for the current output pair.
120	// It advances by 4 bytes (two 16-bit samples) at the end of each outer-loop pass.
121	loadsym P0, input;
122
123	// I1 = coefficient read pointer (set again at the top of every outer-loop pass).
124	loadsym I1, coef;
125
126	// I2 = output write pointer; one 32-bit store (two outputs) per outer-loop pass.
127	loadsym I2, output;
128
129	// Set up the outer hardware loop for P1/2 = 64 passes
130	// (2 outputs are computed per pass)
131
132	LSETUP ( L$0 , L$0end ) LC0 = P1 >> 1;
133
134L$0:
135	loadsym I1, coef;	// rewind the coefficient pointer to the start of coef
136	I0 = P0;	// I0 = running sample pointer for this pass
137		// Set up the inner hardware loop for P2/2 = 32 passes
138		// (each pass is two instructions; each instruction issues 2 MACs)
139
140	LSETUP ( L$1 , L$1end ) LC1 = P2 >> 1;
141
142		// Preamble, executed once per outer pass before the loop body at L$1:
143		// load the first two samples into R0.L/R0.H, clear both accumulators,
144		// and load the first coefficient pair into R1.
145	R0.L = W [ I0 ++ ];
146	A1 = A0 = 0 || R0.H = W [ I0 ++ ] || R1 = [ I1 ++ ];
147
148L$1:
149	A1 += R0.H * R1.L, A0 += R0.L * R1.L || R0.L = W [ I0 ++ ] || NOP;
150L$1end:
151	A1 += R0.L * R1.H, A0 += R0.H * R1.H || R0.H = W [ I0 ++ ] || R1 = [ I1 ++ ];
152
153	// Loop body, 1st instruction: 2 MACs with the low coefficient (R1.L), then load the next sample into R0.L.
154	// Loop body, 2nd instruction: 2 MACs with the high coefficient (R1.H), then load the next sample into R0.H
155	// and the next coefficient pair into R1. A0 accumulates the even output, A1 the odd output.
156	// NOTE(review): each outer pass reads 66 samples and 33 coefficient pairs (the loads issued by the final inner pass are never used in a MAC), so reads run past the 64-word coef table and, in later passes, past the 128-word input buffer -- confirm this is intended.
157	R0.H = A1, R0.L = A0;	// pack the pair of results: A1 -> R0.H, A0 -> R0.L
158
159		// advance data pointer by 2 16b elements
160	P0 += 4;
161
162L$0end:
163	[ I2 ++ ] = R0;	// store 2 outputs (last instruction of the outer loop)
164
165	// Check results: read back the first five 16-bit outputs and compare each with its expected value.
166	loadsym I2, output;
167	// Only input sample x4 (0x4000) is non-zero, so z(k) = x4 * c(4-k) for k = 0..4.
168	R0.L = W [ I2 ++ ];	DBGA ( R0.L , 0x0800 );	// z(0) = x4 * c4
169	R0.L = W [ I2 ++ ];	DBGA ( R0.L , 0x1000 );	// z(1) = x4 * c3
170	R0.L = W [ I2 ++ ];	DBGA ( R0.L , 0x2000 );	// z(2) = x4 * c2
171	R0.L = W [ I2 ++ ];	DBGA ( R0.L , 0x1000 );	// z(3) = x4 * c1
172	R0.L = W [ I2 ++ ];	DBGA ( R0.L , 0x0800 );	// z(4) = x4 * c0
173	pass
174
175	.data
176input:	// FIR input signal: 128 16-bit samples, all zero except sample index 4
177	.dw 0x0000
178	.dw 0x0000
179	.dw 0x0000
180	.dw 0x0000
181	.dw 0x4000	// sample index 4: the only non-zero sample
182	.dw 0x0000
183	.dw 0x0000
184	.dw 0x0000
185	.dw 0x0000
186	.dw 0x0000
187	.space ((128-10)*2);	// remaining 118 samples (236 bytes): must pad with zeros or uninitialized values.
188
189	.data
190coef:	// FIR coefficient table: 64 16-bit entries, five non-zero taps followed by zeros
191	.dw 0x1000	// c0
192	.dw 0x2000	// c1
193	.dw 0x4000	// c2
194	.dw 0x2000	// c3
195	.dw 0x1000	// c4
196	.dw 0x0000	// c5
197	.space ((64-6)*2);	// remaining 58 coefficients (116 bytes): must pad with zeros or uninitialized values.
198
199	.data
200output:	// result buffer; the program writes its outputs here through I2 and reads the first five back
201	.space (128*4)	// reserves 512 bytes
202