/*
* 32bit FIR filter implementation.
*
* function prototype
*	void fract32_fir(TFIR32 *filter, fract32 in[], fract32 out[], int count );
*
* parameters
*   FP+20	-	int count
*   FP+16	R2	fract32 * out
*	FP+12	R1	fract32 * in
*	FP+ 8	R0	TFIR32 * filter
*
* return
*	none
*
* side effects
*   out[] : receives one filtered output sample per input sample
*   filter->head : updated to the new head of the delay line.
*
* TFIR32 structure
*   typedef struct
*   {
*		int taps;
*		fract32 * coeff;
*		fract32 * delayline;
*		fract32 * head;
*	}TFIR32;

* register layout
*   P0 : filter
*   P1 : in
*   P2 : out
*	P3 : count : loop counter's initial value
*	P4 : taps : loop counter's initial value
*   I0 : coeff <- points the coeff sample
*   B0 : coeff
*   L0 : taps*4
*   I1 : head  <- points the delayline head and sample point
*   B1 : delayline
*   L1 : taps*4
*   R0 : *coeff
*   R1 : *delay
*	R3 : taps*4 during setup, then input sample
*	R4 : output sample
*	R5 : taps
*	R7 : scratch during setup and head write-back
*/

	.text
	.align 4
	.global _fract32_fir;
	.type _fract32_fir, STT_FUNC;

/*
 * void fract32_fir(TFIR32 *filter, fract32 in[], fract32 out[], int count);
 *
 * Runs `count` input samples through the FIR filter described by *filter.
 * For each sample: the input is pushed into the circular delay line just
 * before the current head, then `taps` coefficient/delay pairs are
 * multiplied and accumulated, and the accumulator is stored to out[].
 *
 * In:    R0 = filter, R1 = in, R2 = out, [FP+20] = count
 * Out:   none (out[] written, filter->head updated)
 * Saved: R7:4, P5:3 are pushed/popped; L0..L3 are cleared on return.
 * Uses:  R0, R1, R3, P0..P2, I0, I1, B0, B1, A0, A1, LC0, LC1 (not saved).
 *
 * If count <= 0 or filter->taps <= 0 the routine returns without touching
 * out[] or filter->head. A hardware loop set up with a zero count does not
 * skip its body, and a zero taps value would clear L0/L1 and thereby turn
 * circular addressing off, so both cases must be filtered out up front.
 */
_fract32_fir:
	link	0;
	[--sp] = (r7:4, p5:3);		// save all preserved register

		/* Reject degenerate sizes before any pointer/loop setup */
	p0 = r0;					// pointer to filter
	r7 = [fp+20];				// load count (4th argument, on the stack)
	cc = r7 <= 0;				// signed compare: zero or negative count
	if cc jump .Lfir_exit;		// nothing to do
	r5 = [p0];					// taps
	cc = r5 <= 0;				// zero or negative taps: no valid delay line
	if cc jump .Lfir_exit;

		/* Set up registers */
	p3 = r7;					// count : outer loop counter
	p1 = r1;					// pointer to in
	p2 = r2;					// pointer to out
	p4 = r5;					// taps : inner loop counter
	r3 = r5 << 2;				// taps*4 : buffer length in bytes
	l0 = r3;					// circular length for coeff
	l1 = r3;					// circular length for delayline
	r7 = [p0+4];				// filter->coeff
	b0 = r7;
	i0 = b0;					// coeff index starts at the buffer base
	r7 = [p0+8];				// filter->delayline
	b1 = r7;					// delayline base
	r7 = [p0+12];				// filter->head
	i1 = r7;					// delayline index starts at head

		/* outer loop : one pass per input sample */
	loop count lc0 = p3;
	loop_begin count;

		r3 = [p1++];				// load input sample
		i1 -= 4;					// step back the pointer (wraps in the circular buffer)
		[i1] = r3;					// push input sample into delayline

		a0 = a1 = 0;				// clear accumulator

			/* inner loop : one pass per tap.
			 * I0 and I1 each advance by taps*4 bytes in total, i.e. one
			 * full lap of their circular buffers, so both are back at
			 * their starting positions when the loop finishes. */
		loop taps lc1 = p4;
		loop_begin taps;

			r0 = [i0++];				// load coeff
			r1 = [i1++];				// load delaydata

				/* 32x32 multiply built from 16x16 partial products */
			a1 = r1.L * r0.L (fu);		// low * low, both unsigned
			a1 = a1 >> 16;				// keep only its upper part
			a0 += r1.H * r0.H, a1 += r1.H * r0.L (m);	// high*high into A0; cross term into A1
			a1 += r0.H * r1.L (m);		// second cross term
			a1 = a1 >>> 15;				// arithmetic shift: align cross terms with A0
			a0 += a1;					// fold low-order part into the running sum

		loop_end taps;
			/* end of inner loop */

		r4 = a0;					// move accumulated result to a data register

		[p2++] = r4;				// store output sample

	loop_end count;
		/* end of outer loop */

	r7 = i1;
	[p0+12] = r7;				// update filter->head

.Lfir_exit:
	l0 = 0;						// restore all lx register by zero;
	l1 = l0;
	l2 = l0;
	l3 = l0;
	(r7:4, p5:3) = [sp++];		// restore all preserved register
	unlink;
	rts;
	.size	_fract32_fir, .-_fract32_fir
