/*
 *  quant_x86.S
 *
 *  Copyright (C) James Bowman   - May 2000
 *  Copyright (C) Peter Schlaile - Jan 2001
 *
 *  This file is part of libdv, a free DV (IEC 61834/SMPTE 314M)
 *  codec.
 *
 *  libdv is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser Public License as published by
 *  the Free Software Foundation; either version 2.1, or (at your
 *  option) any later version.
 *
 *  libdv is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser Public License
 *  along with libdv; see the file COPYING.  If not, write to
 *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *  The libdv homepage is http://libdv.sourceforge.net/.
 */

        .section .note.GNU-stack, "", @progbits
        .previous

/*
The pattern for dv_88_areas looks like this:

        -1  0  0  1  1  1  2  2
         0  0  1  1  1  2  2  2
         0  1  1  1  2  2  2  3
         1  1  1  2  2  2  3  3
         1  1  2  2  2  3  3  3
         1  2  2  2  3  3  3  3
         2  2  2  3  3  3  3  3
         2  2  3  3  3  3  3  3

Note [1] matrix element [0][0] is untouched.
     [2] all values in the same diagonal are equal.

This implementation works by loading the four shift values in turn,
and shifting all the appropriate array elements.
*/

#include "asmoff.h"

/* void _dv_quant_88_inverse(dv_coeff_t *block, int qno, int class) */

.text
        .align  4
.globl  _dv_quant_88_inverse_x86
.hidden _dv_quant_88_inverse_x86
.type   _dv_quant_88_inverse_x86,@function
_dv_quant_88_inverse_x86:
        pushl   %ebx
        pushl   %esi

#define ARGn(N)  (12+(4*(N)))(%esp)

        /*      register usage:
                eax
                ebx     extra
                ecx
                edx     pq
                esi     block
        */

        /* pq = dv_quant_shifts[qno + dv_quant_offset[class]]; */
        movl    ARGn(1),%eax                    /* qno */
        movl    ARGn(2),%ebx                    /* class */
        movzbl  dv_quant_offset(%ebx),%ecx
        addl    %ecx,%eax
        leal    dv_quant_shifts(,%eax,4),%edx   /* edx is pq */

        /* extra = (class == 3); */
                                        /*  0   1   2   3 */
        subl    $3,%ebx                 /* -3  -2  -1   0 */
        sarl    $31,%ebx                /* -1  -1  -1   0 */
        incl    %ebx                    /*  0   0   0   1 */
        addl    $DV_WEIGHT_BIAS,%ebx    /* ebx is extra */

        movl    ARGn(0),%esi            /* esi is block */
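/*
For reference, a minimal C sketch of what the routine below computes
(illustrative only, not the C code libdv compiles).  It assumes the
layouts implied by the addressing above: dv_quant_offset[] holds one
byte per class, each dv_quant_shifts[] row is four bytes wide,
dv_88_areas[] is the flat 64-entry pattern shown at the top of this
file, dv_coeff_t is a 16-bit integer, and DV_WEIGHT_BIAS is the same
constant asmoff.h provides.  The name quant_88_inverse_ref is
hypothetical.

        #include <stdint.h>

        extern const uint8_t dv_quant_offset[];     // assumed: one byte per class
        extern const uint8_t dv_quant_shifts[][4];  // assumed: four shifts per row
        extern const int8_t  dv_88_areas[64];       // assumed: flat area map, [0] unused

        static void quant_88_inverse_ref(int16_t *block, int qno, int klass)
        {
                const uint8_t *pq = dv_quant_shifts[qno + dv_quant_offset[klass]];
                int extra = (klass == 3) + DV_WEIGHT_BIAS;
                int i;

                block[0] <<= DV_WEIGHT_BIAS;    // [0][0] gets only the bias
                for (i = 1; i < 64; i++)
                        block[i] <<= pq[dv_88_areas[i]] + extra;
        }

Walking the block by diagonals instead of by flat index lets each of
the four shift counts be loaded into %ecx exactly once.
*/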
        /*
        Pick up each of the factors into %ecx, then shift the
        appropriate coefficients.  The pattern here is taken from
        dv_88_areas; it's arranged by diagonals for clarity.
        */

#define ADDR(row,col)   (2*(8*row+col))(%esi)
#define MSHIFT(row,col) \
        shlw    %cl,ADDR(row,col)

        movl    $DV_WEIGHT_BIAS,%ecx
        MSHIFT(0,0)

        /* 0 */
        movzbl  (%edx),%ecx
        addl    %ebx,%ecx
        MSHIFT(0,1)
        MSHIFT(1,0)

        MSHIFT(0,2)
        MSHIFT(1,1)
        MSHIFT(2,0)

        /* 1 */
        movzbl  1(%edx),%ecx
        addl    %ebx,%ecx
        MSHIFT(0,3)
        MSHIFT(1,2)
        MSHIFT(2,1)
        MSHIFT(3,0)

        MSHIFT(0,4)
        MSHIFT(1,3)
        MSHIFT(2,2)
        MSHIFT(3,1)
        MSHIFT(4,0)

        MSHIFT(0,5)
        MSHIFT(1,4)
        MSHIFT(2,3)
        MSHIFT(3,2)
        MSHIFT(4,1)
        MSHIFT(5,0)

        /* 2 */
        movzbl  2(%edx),%ecx
        addl    %ebx,%ecx
        MSHIFT(0,6)
        MSHIFT(1,5)
        MSHIFT(2,4)
        MSHIFT(3,3)
        MSHIFT(4,2)
        MSHIFT(5,1)
        MSHIFT(6,0)

        MSHIFT(0,7)
        MSHIFT(1,6)
        MSHIFT(2,5)
        MSHIFT(3,4)
        MSHIFT(4,3)
        MSHIFT(5,2)
        MSHIFT(6,1)
        MSHIFT(7,0)

        MSHIFT(1,7)
        MSHIFT(2,6)
        MSHIFT(3,5)
        MSHIFT(4,4)
        MSHIFT(5,3)
        MSHIFT(6,2)
        MSHIFT(7,1)

        /* 3 */
        movzbl  3(%edx),%ecx
        addl    %ebx,%ecx
        MSHIFT(2,7)
        MSHIFT(3,6)
        MSHIFT(4,5)
        MSHIFT(5,4)
        MSHIFT(6,3)
        MSHIFT(7,2)

        MSHIFT(3,7)
        MSHIFT(4,6)
        MSHIFT(5,5)
        MSHIFT(6,4)
        MSHIFT(7,3)

        MSHIFT(4,7)
        MSHIFT(5,6)
        MSHIFT(6,5)
        MSHIFT(7,4)

        MSHIFT(5,7)
        MSHIFT(6,6)
        MSHIFT(7,5)

        MSHIFT(6,7)
        MSHIFT(7,6)

        MSHIFT(7,7)

#undef ARGn

        popl    %esi
        popl    %ebx
        ret

        .align  4
.globl  _dv_quant_x86
.hidden _dv_quant_x86
.type   _dv_quant_x86,@function
_dv_quant_x86:
        pushl   %ebx
        pushl   %ecx
        pushl   %edx
        pushl   %esi

#define ARGn(N)  (20+(4*(N)))(%esp)

        /*      register usage:
                eax
                ebx     extra
                ecx
                edx     pq
                esi     block
        */

        /* pq = dv_quant_shifts[qno + dv_quant_offset[class]]; */
        movl    ARGn(1),%eax                    /* qno */
        movl    ARGn(2),%ebx                    /* class */
        movzbl  dv_quant_offset(%ebx),%ecx
        addl    %ecx,%eax
        leal    dv_quant_shifts(,%eax,4),%edx   /* edx is pq */

        /* extra = (class == 3); */
                                        /*  0   1   2   3 */
        subl    $3,%ebx                 /* -3  -2  -1   0 */
        sarl    $31,%ebx                /* -1  -1  -1   0 */
        incl    %ebx                    /*  0   0   0   1 */
                                        /* ebx is extra */

        movl    ARGn(0),%esi            /* esi is block */

        /*
        Since we have already reordered the coefficients, it's easy:
        shift everything between OFS0 and OFS1 by the first pq value,
        everything between OFS1 and OFS2 by the second pq value, and
        so on.  Since we really want to divide, we have to compensate
        for negative values.  The rest is pipeline optimization, which
        results in obfuscated MMX code...
        */
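/*
The compensation for negative values amounts to a right shift that
rounds toward zero instead of toward minus infinity.  A minimal C
sketch of the scalar equivalent, using the OFS0..OFS3 area boundaries
defined just below; shift_toward_zero and quant_ref are illustrative
names, not libdv functions:

        #include <stdint.h>

        static int16_t shift_toward_zero(int16_t x, int shift)
        {
                int sign = x >> 15;             // 0 for x >= 0, -1 for x < 0
                int mag  = (x ^ sign) - sign;   // fold to |x|
                mag >>= shift;
                return (int16_t)((mag ^ sign) - sign);  // fold the sign back in
        }

        static void quant_ref(int16_t *block, const uint8_t *pq, int extra)
        {
                const int ofs[5] = { OFS0, OFS1, OFS2, OFS3, 64 };
                int area, i;

                for (area = 0; area < 4; area++)
                        for (i = ofs[area]; i < ofs[area + 1]; i++)
                                block[i] = shift_toward_zero(block[i],
                                                             pq[area] + extra);
        }

The MMX code below performs the same xor/subtract folding with
psraw-generated sign masks, two or three quadwords at a time, with the
final odd coefficient handled in the integer registers.  Because the
area lengths are not multiples of four, each area's first quadword is
loaded before the previous area's results are stored, so the
overlapping boundary words end up quantized with their own pq value.
*/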
#       sarw    %cl,ADDR(row,col)

#define OFS0    (1)
#define OFS1    (1+2+3)
#define OFS2    (1+2+3+4+5+6)
#define OFS3    (1+2+3+4+5+6+7+8+7)

        /* 0 */
        movzbl  (%edx),%ecx
        movq    OFS0*2(%esi), %mm2
        addl    %ebx, %ecx                      /* ecx = pq[0] + extra */
        movq    (OFS0+4)*2(%esi), %mm4
        movd    %ecx, %mm7                      /* mm7 holds the shift count */
        movq    %mm2, %mm3
        movq    %mm4, %mm5
        psraw   $0x0f, %mm2                     /* mm2/mm4: per-word sign masks */
        psraw   $0x0f, %mm4
        pxor    %mm2, %mm3                      /* fold to absolute values */
        pxor    %mm4, %mm5
        psubw   %mm2, %mm3
        psubw   %mm4, %mm5
        psraw   %mm7, %mm3                      /* the actual "division" */
        psraw   %mm7, %mm5
        pxor    %mm2, %mm3                      /* fold the signs back in */
        pxor    %mm4, %mm5
        psubw   %mm2, %mm3
        psubw   %mm4, %mm5
        /* preload area 1 before the stores below; the overlapping
           boundary words are redone in pass 1 with the correct pq */
        movq    (OFS1*2)(%esi), %mm2
        movq    %mm3, OFS0*2(%esi)
        movq    %mm5, (OFS0+4)*2(%esi)

        /* 1 */
        movzbl  1(%edx),%ecx
        movq    (OFS1+4)*2(%esi), %mm4
        addl    %ebx, %ecx
        movq    %mm2, %mm3
        movd    %ecx, %mm7
        movq    %mm4, %mm5
        psraw   $0x0f, %mm2
        psraw   $0x0f, %mm4
        pxor    %mm2, %mm3
        pxor    %mm4, %mm5
        psubw   %mm2, %mm3
        psubw   %mm4, %mm5
        psraw   %mm7, %mm3
        psraw   %mm7, %mm5
        pxor    %mm2, %mm3
        pxor    %mm4, %mm5
        psubw   %mm2, %mm3
        psubw   %mm4, %mm5
        movq    %mm3, OFS1*2(%esi)
        movq    (OFS1+8)*2(%esi), %mm2
        movq    %mm5, (OFS1+4)*2(%esi)
        movq    (OFS1+12)*2(%esi), %mm4
        movq    %mm2, %mm3
        movq    %mm4, %mm5
        psraw   $0x0f, %mm2
        psraw   $0x0f, %mm4
        pxor    %mm2, %mm3
        pxor    %mm4, %mm5
        psubw   %mm2, %mm3
        psubw   %mm4, %mm5
        psraw   %mm7, %mm3
        psraw   %mm7, %mm5
        pxor    %mm2, %mm3
        pxor    %mm4, %mm5
        psubw   %mm2, %mm3
        psubw   %mm4, %mm5
        movq    OFS2*2(%esi), %mm0              /* preload area 2 */
        movq    %mm3, (OFS1+8)*2(%esi)
        movq    %mm5, (OFS1+12)*2(%esi)

        /* 2 */
        movzbl  2(%edx),%ecx
        movq    (OFS2+4)*2(%esi), %mm2
        addl    %ebx, %ecx
        movq    (OFS2+8)*2(%esi), %mm4
        movd    %ecx, %mm7
        movq    %mm0, %mm1
        movq    %mm2, %mm3
        movq    %mm4, %mm5
        psraw   $0x0f, %mm0
        psraw   $0x0f, %mm2
        psraw   $0x0f, %mm4
        pxor    %mm0, %mm1
        pxor    %mm2, %mm3
        pxor    %mm4, %mm5
        psubw   %mm0, %mm1
        psubw   %mm2, %mm3
        psubw   %mm4, %mm5
        psraw   %mm7, %mm1
        psraw   %mm7, %mm3
        psraw   %mm7, %mm5
        pxor    %mm0, %mm1
        pxor    %mm2, %mm3
        pxor    %mm4, %mm5
        psubw   %mm0, %mm1
        psubw   %mm2, %mm3
        psubw   %mm4, %mm5
        movq    %mm1, OFS2*2(%esi)
        movq    %mm3, (OFS2+4)*2(%esi)
        movq    %mm5, (OFS2+8)*2(%esi)
        movq    (OFS2+12)*2(%esi), %mm0
        movq    (OFS2+16)*2(%esi), %mm2
        movq    (OFS2+20)*2(%esi), %mm4
        movq    %mm0, %mm1
        movq    %mm2, %mm3
        movq    %mm4, %mm5
        psraw   $0x0f, %mm0
        psraw   $0x0f, %mm2
        psraw   $0x0f, %mm4
        pxor    %mm0, %mm1
        pxor    %mm2, %mm3
        pxor    %mm4, %mm5
        psubw   %mm0, %mm1
        psubw   %mm2, %mm3
        psubw   %mm4, %mm5
        psraw   %mm7, %mm1
        psraw   %mm7, %mm3
        psraw   %mm7, %mm5
        pxor    %mm0, %mm1
        pxor    %mm2, %mm3
        pxor    %mm4, %mm5
        psubw   %mm0, %mm1
        psubw   %mm2, %mm3
        psubw   %mm4, %mm5
        movq    OFS3*2(%esi), %mm0              /* preload area 3 */
        movq    %mm1, (OFS2+12)*2(%esi)
        movq    %mm3, (OFS2+16)*2(%esi)
        movq    %mm5, (OFS2+20)*2(%esi)

        /* 3 */
        movzbl  3(%edx),%ecx
        movq    (OFS3+4)*2(%esi), %mm2
        addl    %ebx, %ecx
        movq    (OFS3+8)*2(%esi), %mm4
        movd    %ecx, %mm7
        movq    %mm0, %mm1
        movq    %mm2, %mm3
        movq    %mm4, %mm5
        psraw   $0x0f, %mm0
        psraw   $0x0f, %mm2
        psraw   $0x0f, %mm4
        pxor    %mm0, %mm1
        pxor    %mm2, %mm3
        pxor    %mm4, %mm5
        psubw   %mm0, %mm1
        psubw   %mm2, %mm3
        psubw   %mm4, %mm5
        psraw   %mm7, %mm1
        psraw   %mm7, %mm3
        psraw   %mm7, %mm5
        pxor    %mm0, %mm1
        pxor    %mm2, %mm3
        pxor    %mm4, %mm5
        psubw   %mm0, %mm1
        psubw   %mm2, %mm3
        psubw   %mm4, %mm5
        movq    %mm1, OFS3*2(%esi)
        movq    %mm3, (OFS3+4)*2(%esi)
        movq    %mm5, (OFS3+8)*2(%esi)
        movq    (OFS3+12)*2(%esi), %mm2
        movq    (OFS3+16)*2(%esi), %mm4
        movq    %mm2, %mm3
        movq    %mm4, %mm5
        psraw   $0x0f, %mm2
        psraw   $0x0f, %mm4
        pxor    %mm2, %mm3
        pxor    %mm4, %mm5
        psubw   %mm2, %mm3
        psubw   %mm4, %mm5
        psraw   %mm7, %mm3
        psraw   %mm7, %mm5
        pxor    %mm2, %mm3
        pxor    %mm4, %mm5
        psubw   %mm2, %mm3
        psubw   %mm4, %mm5
        movq    %mm3, (OFS3+12)*2(%esi)
        movq    %mm5, (OFS3+16)*2(%esi)

        /* the last, odd coefficient (index OFS3+20 = 63) is handled in
           the integer registers with the same sign-folding trick */
        movw    (OFS3+20)*2(%esi), %ax
        movw    %ax, %bx
        sarw    $0xf, %bx
        xorw    %bx, %ax
        subw    %bx, %ax
        sarw    %cl, %ax
        xorw    %bx, %ax
        subw    %bx, %ax
        movw    %ax, (OFS3+20)*2(%esi)

        popl    %esi
        popl    %edx
        popl    %ecx
        popl    %ebx
        ret