202 lines
4.8 KiB
ArmAsm
202 lines
4.8 KiB
ArmAsm
|
# mach: bfin
|
||
|
|
||
|
// FIR FILTER COMPUTED DIRECTLY ON INPUT WITH NO
|
||
|
// INTERNAL STATE
|
||
|
// TWO OUTPUTS PER ITERATION
|
||
|
// This program computes a FIR filter without maintaining a buffer of internal
|
||
|
// state.
|
||
|
// This example computes two output samples per inner loop. The following
|
||
|
// diagram shows the alignment required for signal x and coefficients c:
|
||
|
//   x0 x1 x2 x3 x4 x5
//   c0 c1 c2 c3 c4      -> output z(0)=x0*c0 + x1*c1 + ...
//      c0 c1 c2 c3 c4   -> z(1)=x1*c0 + x2*c1 + ...
|
||
|
// Z(k) = \sum_{n=0}^{L-1} c(n) \cdot x(n+k)
|
||
|
// Naive, first stab at splitting this for dual MACs.
|
||
|
// R(k) = \sum_{n=0}^{L/2-1} x(2n) \cdot y(2n+k) + \sum_{n=0}^{L/2-1} x(2n+1) \cdot y(2n+1+k)
|
||
|
// Alternate, better partitioning for the machine.
|
||
|
// R(0) = \sum_{n=0}^{L-1} x(n) \cdot y(n)
// R(1) = \sum_{n=0}^{L-1} x(n) \cdot y(n+1)
// R(2) = \sum_{n=0}^{L-1} x(n) \cdot y(n+2)
// R(3) = \sum_{n=0}^{L-1} x(n) \cdot y(n+3)
|
||
|
// .
|
||
|
// .
|
||
|
// .
|
||
|
// .
|
||
|
// In this version the inner loop computes R(2k) and R(2k+1) in parallel:
|
||
|
// R(2k)   = \sum_{n=0}^{L-1} x(n) \cdot y(n+2k)
// R(2k+1) = \sum_{n=0}^{L-1} x(n) \cdot y(n+2k+1)
|
||
|
// Implementation
|
||
|
// --------------
|
||
|
// Sample pair x1 x0 is loaded into register R0, and coefficient pair c1 c0
// is loaded into register R1:
|
||
|
// +-------+ R0
|
||
|
// | x1 x0 |
|
||
|
// +-------+
|
||
|
// +-------+ R1
|
||
|
// | c1 c0 | compute two MACs: z(0)+=x0*c0, and z(1)+=x1*c0
|
||
|
// +-------+
|
||
|
// Now load x2 into lo half of R0, and compute the next two MACs:
|
||
|
// +-------+ R0
|
||
|
// | x1 x2 |
|
||
|
// +-------+
|
||
|
// +-------+ R1
|
||
|
// | c1 c0 | compute z(0)+=x1*c1 and z(1)+=x2*c1 (c0 not used)
|
||
|
// +-------+
|
||
|
// Meanwhile, load coefficient pair c3 c2 into R2, and x3 into hi half of R0:
|
||
|
// +-------+ R0
|
||
|
// | x3 x2 |
|
||
|
// +-------+
|
||
|
// +-------+ R2
|
||
|
// | c3 c2 | compute z(0)+=x2*c2 and z(1)+=x3*c2 (c3 not used)
|
||
|
// +-------+
|
||
|
// Load x4 into low half of R0:
|
||
|
// +-------+ R0
|
||
|
// | x3 x4 |
|
||
|
// +-------+
|
||
|
// +-------+ R2
|
||
|
// | c3 c2 | compute z(0)+=x3*c3 and z(1)+=x4*c3 (c2 not used)
|
||
|
// +-------+
|
||
|
// This is a reference FIR function used to test:
|
||
|
//void firf (float input[], float output[], float coeffs[],
|
||
|
// long input_size, long coeffs_size)
|
||
|
//{
|
||
|
// long i, k;
|
||
|
// for(i=0; i< input_size; i++){
|
||
|
// output[i] = 0;
|
||
|
// for(k=0; k < coeffs_size; k++)
|
||
|
// output[i] += input[k+i] * coeffs[k];
|
||
|
// }
|
||
|
//}
|
||
|
|
||
|
.include "testutils.inc"
|
||
|
// ---- Test entry and register/pointer setup ----
// `start` and `loadsym` are macros provided by testutils.inc.
	start

	// Clear the data registers before use.
	R0 = 0; R1 = 0; R2 = 0;

	// Loop bounds, held in pointer registers P1 and P2. Each one is halved
	// (>> 1) by the LSETUP instruction that consumes it.
	P1 = 128 (X);		// outer bound: LC0 = P1 >> 1
	P2 = 64 (X);		// inner bound: LC1 = P2 >> 1

	// P0 = address of the first input sample used for the current pair of
	// outputs. The outer loop advances it by 4 bytes (two 16-bit samples)
	// once per pass.
	loadsym P0, input;

	// I1 = coefficient read pointer. It is loaded again at the top of every
	// outer-loop pass, so this initial load is redundant with that one.
	loadsym I1, coef;

	// I2 = output write pointer; the outer loop performs one 32-bit store
	// (two 16-bit results) through it per pass.
	loadsym I2, output;
|
||
|
|
||
|
// ---- FIR computation: nested hardware loops ----
// Outer loop (LC0): P1 >> 1 passes, two output samples produced per pass.
// Inner loop (LC1): P2 >> 1 passes, one coefficient pair consumed per pass
// using two dual-MAC instructions (four multiply-accumulates in total).
//
// Register roles inside the loops:
//   I0    sample read pointer (restarted from P0 every outer pass)
//   I1    coefficient read pointer (restarted from coef every outer pass)
//   R0    two 16-bit samples; the halves are refilled alternately
//   R1    one 32-bit coefficient pair (low half = even, high half = odd)
//   A0    accumulator for the even output, A1 for the odd output
//
// NOTE(review): every outer pass reads 2 + 2*(P2 >> 1) = 66 samples starting
// at P0, so with P1 = 128 the later passes read beyond the 128-sample input
// array. Only the first five outputs are checked afterwards -- confirm that
// the over-read is intentional.

	LSETUP ( L$0 , L$0end ) LC0 = P1 >> 1;

L$0:
	// Rewind the coefficient pointer and start sampling at P0.
	loadsym I1, coef;
	I0 = P0;

	// Program the inner loop. The two instructions between this LSETUP
	// and L$1 run once per outer pass and prime R0/R1 and the accumulators.
	LSETUP ( L$1 , L$1end ) LC1 = P2 >> 1;

	// Load the first two samples into R0 and the first coefficient pair
	// into R1, clearing both accumulators in the same issue slot.
	R0.L = W [ I0 ++ ];
	A1 = A0 = 0 || R0.H = W [ I0 ++ ] || R1 = [ I1 ++ ];

L$1:
	// Two MACs with the low (even) coefficient; fetch the next sample
	// into R0.L.
	A1 += R0.H * R1.L, A0 += R0.L * R1.L || R0.L = W [ I0 ++ ] || NOP;
L$1end:
	// Two MACs with the high (odd) coefficient; fetch the next sample
	// into R0.H and the next coefficient pair into R1.
	A1 += R0.L * R1.H, A0 += R0.H * R1.H || R0.H = W [ I0 ++ ] || R1 = [ I1 ++ ];

	// Pack both accumulator results into R0 (A1 -> high half, A0 -> low).
	R0.H = A1, R0.L = A0;

	// Advance the data pointer by two 16-bit elements.
	P0 += 4;

L$0end:
	[ I2 ++ ] = R0;		// store 2 outputs
|
||
|
|
||
|
// ---- Check results ----
// The input is a single impulse of 0x4000 at sample index 4, so outputs
// z(0)..z(4) pick up coefficients c4..c0 in turn. Each expected value below
// is the corresponding coefficient halved.
	loadsym I2, output;

	R0.L = W [ I2 ++ ]; DBGA ( R0.L , 0x0800 );	// z(0): c4 = 0x1000
	R0.L = W [ I2 ++ ]; DBGA ( R0.L , 0x1000 );	// z(1): c3 = 0x2000
	R0.L = W [ I2 ++ ]; DBGA ( R0.L , 0x2000 );	// z(2): c2 = 0x4000
	R0.L = W [ I2 ++ ]; DBGA ( R0.L , 0x1000 );	// z(3): c1 = 0x2000
	R0.L = W [ I2 ++ ]; DBGA ( R0.L , 0x0800 );	// z(4): c0 = 0x1000

	// All five checks ran; `pass` is the success macro from testutils.inc.
	pass
|
||
|
|
||
|
// ---- Input samples: 128 16-bit words ----
// Ten explicit words (a single 0x4000 impulse at index 4) followed by
// padding up to 128 entries.
	.data
input:
	.dw 0x0000
	.dw 0x0000
	.dw 0x0000
	.dw 0x0000
	.dw 0x4000		// the impulse, sample index 4
	.dw 0x0000
	.dw 0x0000
	.dw 0x0000
	.dw 0x0000
	.dw 0x0000
	.space ((128-10)*2);	// must pad with zeros or uninitialized values.
|
||
|
|
||
|
// ---- Filter coefficients: 64 16-bit words ----
// Five non-zero taps (symmetric: 0x1000 0x2000 0x4000 0x2000 0x1000), one
// explicit zero, then padding up to 64 entries.
	.data
coef:
	.dw 0x1000		// c0
	.dw 0x2000		// c1
	.dw 0x4000		// c2
	.dw 0x2000		// c3
	.dw 0x1000		// c4
	.dw 0x0000		// c5
	.space ((64-6)*2);	// must pad with zeros or uninitialized values.
|
||
|
|
||
|
// ---- Output buffer ----
// Reserves 128*4 bytes. The filter loop writes one 32-bit word (two 16-bit
// results) per outer pass through I2, and the checks read it back as
// 16-bit words.
	.data
output:
	.space (128*4)
|