Projet_SETI_RISC-V/riscv-gnu-toolchain/binutils/sim/testsuite/bfin/fir.s

# mach: bfin

// FIR FILTER COMPTUED DIRECTLY ON INPUT WITH NO
//   INTERNAL STATE
//   TWO OUTPUTS PER ITERATION
// This program computes a FIR filter without maintaining a buffer of internal
// state.
// This example computes two output samples per inner loop. The following
// diagram shows the alignment required for signal x and coefficients c:
// x0 x1 x2 x3 x4 x5
// c0 c1 c2 c3 c4      -> output z(0)=x0*c0 + x1*c1 + ...
//    c0 c1 c2 c3 c4   ->        z(1)=x1*c0 + x2*c1 + ...
//	       L-1
//               ---
//      Z(k) =   \   c(n) * x(n+k)
//               /
//	         ---
//       	       n=0
// Naive, first stab at spliting this for dual MACS.
//	       L/2-1                     L/2-1
//               --- 		           ---
//      R(k) =   \   (x(2n) * y(2n+k))  +  \   (x(2n-1) * y(2n-1+k))
//               /  		           /
//	         --- 		           ---
//       	       n=0		         n=0
// Alternate, better partitioning for the machine.
//	       L-1
//               ---
//      R(0) =   \   x(n) * y(n)
//               /
//	         ---
//       	 n=0
//	       L-1
//               ---
//      R(1) =   \   x(n) * y(n+1)
//               /
//	         ---
//              n=0
//	       L-1
//               ---
//      R(2) =   \   x(n) * y(n+2)
//               /
//	         ---
//              n=0
//	       L-1
//               ---
//      R(3) =   \   x(n) * y(n+3)
//               /
//	         ---
//               n=0
//		.
//		.
//		.
//		.
// Okay in this verion the inner loop will compute R(2k) and R(2k+1) in parallel
//	       L-1
//               ---
//     R(2k) =   \   x(n) * y(n+2k)
//               /
//	         ---
//              n=0
//	       L-1
//               ---
//   R(2k+1) =   \   x(n) * y(n+2k+1)
//               /
//	         ---
//              n=0
// Implementation
// --------------
// Sample pair x1 x0 is loaded into register R0, and coefficients c1 c0
// is loaded into register R1:
// +-------+ R0
// | x1 x0 |
// +-------+
// +-------+ R1
// | c1 c0 |  compute two MACs: z(0)+=x0*c0, and z(1)+=x1*c0
// +-------+
// Now load x2 into lo half of R0, and compute the next two MACs:
// +-------+ R0
// | x1 x2 |
// +-------+
// +-------+ R1
// | c1 c0 |    compute z(0)+=x1*c1 and z(1)+=x2*c1 (c0 not used)
// +-------+
// Meanwhile, load coefficient pair c3 c2 into R2, and x3 into hi half of R0:
// +-------+ R0
// | x3 x2 |
// +-------+
// +-------+ R2
// | c3 c2 |    compute z(0)+=x2*c2 and z(1)+=x3*c2 (c3 not used)
// +-------+
// Load x4 into low half of R0:
// +-------+ R0
// | x3 x4 |
// +-------+
// +-------+ R1
// | c3 c2 |    compute z(0)+=x3*c3 and z(1)+=x4*c3 (c2 not used)
// +-------+
// //This is a reference FIR function used to test: */
//void firf (float input[], float  output[], float coeffs[],
//           long input_size, long coeffs_size)
//{
//  long i, k;
//  for(i=0;	i< input_size; i++){
//    output[i] = 0;
//    for(k=0;	k < coeffs_size; k++)
//	output[i] += input[k+i] * coeffs[k];
// }
//}

.include "testutils.inc"
	start


	R0 = 0;	R1 = 0; R2 = 0;
	P1 = 128 (X);	// Load loop bounds in R5, R6, and divide by 2
	P2 = 64 (X);

	// P0 holds pointer to input data in one memory
	// bank. Increments by 2 after each inner-loop iter
	loadsym P0, input;

	// Pointer to coeffs in alternate memory bank.
	loadsym I1, coef;

	// Pointer to outputs in any memory bank.
	loadsym I2, output;

	// Setup outer do-loop for M/2 iterations
	// (2 outputs are computed per pass)

	LSETUP ( L$0 , L$0end ) LC0 = P1 >> 1;

L$0:
	loadsym I1, coef;
	I0 = P0;
		// Set-up inner do-loop for L/2 iterations
		// (2 MACs are computed per pass)

	LSETUP ( L$1 , L$1end ) LC1 = P2 >> 1;

		// Load first two data elements in r0,
		// and two coeffs into r1:

	R0.L = W [ I0 ++ ];
	A1 = A0 = 0 || R0.H = W [ I0 ++ ] || R1 = [ I1 ++ ];

L$1:
	A1 += R0.H * R1.L, A0 += R0.L * R1.L || R0.L = W [ I0 ++ ] || NOP;
L$1end:
	A1 += R0.L * R1.H, A0 += R0.H * R1.H || R0.H = W [ I0 ++ ] || R1 = [ I1 ++ ];

	// Line 1: do 2 MACs and load next data element into RL0.
	// Line 2: do 2 MACs, load next data element into RH0,
	// and load next 2 coeffs

	R0.H = A1, R0.L = A0;

		// advance data pointer by 2 16b elements
	P0 += 4;

L$0end:
	[ I2 ++ ] = R0;	// store 2 outputs

	// Check results
	loadsym I2, output;

	R0.L = W [ I2 ++ ];	DBGA ( R0.L , 0x0800 );
	R0.L = W [ I2 ++ ];	DBGA ( R0.L , 0x1000 );
	R0.L = W [ I2 ++ ];	DBGA ( R0.L , 0x2000 );
	R0.L = W [ I2 ++ ];	DBGA ( R0.L , 0x1000 );
	R0.L = W [ I2 ++ ];	DBGA ( R0.L , 0x0800 );
	pass

	.data
input:
	.dw 0x0000
	.dw 0x0000
	.dw 0x0000
	.dw 0x0000
	.dw 0x4000
	.dw 0x0000
	.dw 0x0000
	.dw 0x0000
	.dw 0x0000
	.dw 0x0000
	.space ((128-10)*2);	// must pad with zeros or uninitialized values.

	.data
coef:
	.dw 0x1000
	.dw 0x2000
	.dw 0x4000
	.dw 0x2000
	.dw 0x1000
	.dw 0x0000
	.space ((64-6)*2);	// must pad with zeros or uninitialized values.

	.data
output:
	.space (128*4)
projet 2023-03-06 14:48:14 +01:00			`# mach: bfin`

			`// FIR FILTER COMPTUED DIRECTLY ON INPUT WITH NO`
			`// INTERNAL STATE`
			`// TWO OUTPUTS PER ITERATION`
			`// This program computes a FIR filter without maintaining a buffer of internal`
			`// state.`
			`// This example computes two output samples per inner loop. The following`
			`// diagram shows the alignment required for signal x and coefficients c:`
			`// x0 x1 x2 x3 x4 x5`
			`// c0 c1 c2 c3 c4 -> output z(0)=x0c0 + x1c1 + ...`
			`// c0 c1 c2 c3 c4 -> z(1)=x1c0 + x2c1 + ...`
			`// L-1`
			`// ---`
			`// Z(k) = \ c(n) * x(n+k)`
			`// /`
			`// ---`
			`// n=0`
			`// Naive, first stab at spliting this for dual MACS.`
			`// L/2-1 L/2-1`
			`// --- ---`
			`// R(k) = \ (x(2n) * y(2n+k)) + \ (x(2n-1) * y(2n-1+k))`
			`// / /`
			`// --- ---`
			`// n=0 n=0`
			`// Alternate, better partitioning for the machine.`
			`// L-1`
			`// ---`
			`// R(0) = \ x(n) * y(n)`
			`// /`
			`// ---`
			`// n=0`
			`// L-1`
			`// ---`
			`// R(1) = \ x(n) * y(n+1)`
			`// /`
			`// ---`
			`// n=0`
			`// L-1`
			`// ---`
			`// R(2) = \ x(n) * y(n+2)`
			`// /`
			`// ---`
			`// n=0`
			`// L-1`
			`// ---`
			`// R(3) = \ x(n) * y(n+3)`
			`// /`
			`// ---`
			`// n=0`
			`// .`
			`// .`
			`// .`
			`// .`
			`// Okay in this verion the inner loop will compute R(2k) and R(2k+1) in parallel`
			`// L-1`
			`// ---`
			`// R(2k) = \ x(n) * y(n+2k)`
			`// /`
			`// ---`
			`// n=0`
			`// L-1`
			`// ---`
			`// R(2k+1) = \ x(n) * y(n+2k+1)`
			`// /`
			`// ---`
			`// n=0`
			`// Implementation`
			`// --------------`
			`// Sample pair x1 x0 is loaded into register R0, and coefficients c1 c0`
			`// is loaded into register R1:`
			`// +-------+ R0`
			`// \| x1 x0 \|`
			`// +-------+`
			`// +-------+ R1`
			`// \| c1 c0 \| compute two MACs: z(0)+=x0c0, and z(1)+=x1c0`
			`// +-------+`
			`// Now load x2 into lo half of R0, and compute the next two MACs:`
			`// +-------+ R0`
			`// \| x1 x2 \|`
			`// +-------+`
			`// +-------+ R1`
			`// \| c1 c0 \| compute z(0)+=x1c1 and z(1)+=x2c1 (c0 not used)`
			`// +-------+`
			`// Meanwhile, load coefficient pair c3 c2 into R2, and x3 into hi half of R0:`
			`// +-------+ R0`
			`// \| x3 x2 \|`
			`// +-------+`
			`// +-------+ R2`
			`// \| c3 c2 \| compute z(0)+=x2c2 and z(1)+=x3c2 (c3 not used)`
			`// +-------+`
			`// Load x4 into low half of R0:`
			`// +-------+ R0`
			`// \| x3 x4 \|`
			`// +-------+`
			`// +-------+ R1`
			`// \| c3 c2 \| compute z(0)+=x3c3 and z(1)+=x4c3 (c2 not used)`
			`// +-------+`
			`// //This is a reference FIR function used to test: */`
			`//void firf (float input[], float output[], float coeffs[],`
			`// long input_size, long coeffs_size)`
			`//{`
			`// long i, k;`
			`// for(i=0; i< input_size; i++){`
			`// output[i] = 0;`
			`// for(k=0; k < coeffs_size; k++)`
			`// output[i] += input[k+i] * coeffs[k];`
			`// }`
			`//}`

			`.include "testutils.inc"`
			`start`


			`R0 = 0; R1 = 0; R2 = 0;`
			`P1 = 128 (X); // Load loop bounds in R5, R6, and divide by 2`
			`P2 = 64 (X);`

			`// P0 holds pointer to input data in one memory`
			`// bank. Increments by 2 after each inner-loop iter`
			`loadsym P0, input;`

			`// Pointer to coeffs in alternate memory bank.`
			`loadsym I1, coef;`

			`// Pointer to outputs in any memory bank.`
			`loadsym I2, output;`

			`// Setup outer do-loop for M/2 iterations`
			`// (2 outputs are computed per pass)`

			`LSETUP ( L$0 , L$0end ) LC0 = P1 >> 1;`

			`L$0:`
			`loadsym I1, coef;`
			`I0 = P0;`
			`// Set-up inner do-loop for L/2 iterations`
			`// (2 MACs are computed per pass)`

			`LSETUP ( L$1 , L$1end ) LC1 = P2 >> 1;`

			`// Load first two data elements in r0,`
			`// and two coeffs into r1:`

			`R0.L = W [ I0 ++ ];`
			`A1 = A0 = 0 \|\| R0.H = W [ I0 ++ ] \|\| R1 = [ I1 ++ ];`

			`L$1:`
			`A1 += R0.H * R1.L, A0 += R0.L * R1.L \|\| R0.L = W [ I0 ++ ] \|\| NOP;`
			`L$1end:`
			`A1 += R0.L * R1.H, A0 += R0.H * R1.H \|\| R0.H = W [ I0 ++ ] \|\| R1 = [ I1 ++ ];`

			`// Line 1: do 2 MACs and load next data element into RL0.`
			`// Line 2: do 2 MACs, load next data element into RH0,`
			`// and load next 2 coeffs`

			`R0.H = A1, R0.L = A0;`

			`// advance data pointer by 2 16b elements`
			`P0 += 4;`

			`L$0end:`
			`[ I2 ++ ] = R0; // store 2 outputs`

			`// Check results`
			`loadsym I2, output;`

			`R0.L = W [ I2 ++ ]; DBGA ( R0.L , 0x0800 );`
			`R0.L = W [ I2 ++ ]; DBGA ( R0.L , 0x1000 );`
			`R0.L = W [ I2 ++ ]; DBGA ( R0.L , 0x2000 );`
			`R0.L = W [ I2 ++ ]; DBGA ( R0.L , 0x1000 );`
			`R0.L = W [ I2 ++ ]; DBGA ( R0.L , 0x0800 );`
			`pass`

			`.data`
			`input:`
			`.dw 0x0000`
			`.dw 0x0000`
			`.dw 0x0000`
			`.dw 0x0000`
			`.dw 0x4000`
			`.dw 0x0000`
			`.dw 0x0000`
			`.dw 0x0000`
			`.dw 0x0000`
			`.dw 0x0000`
			`.space ((128-10)*2); // must pad with zeros or uninitialized values.`

			`.data`
			`coef:`
			`.dw 0x1000`
			`.dw 0x2000`
			`.dw 0x4000`
			`.dw 0x2000`
			`.dw 0x1000`
			`.dw 0x0000`
			`.space ((64-6)*2); // must pad with zeros or uninitialized values.`

			`.data`
			`output:`
			`.space (128*4)`