/* FrameReconstructor_SSE2.c */
/* 2009/07/02                */

#include "StdAfx.h"

#include "FrameReconstructor.h"

#include "MotionComp_SSE2.h"

/* */

#pragma warning(disable : 4799)

/* */

/* Transposes an 8x8 matrix of 16-bit values: row i of y receives column i
   of x. The source rows are fetched with unaligned loads; the destination
   is written through plain __m128i assignments, which presume that y is
   16-byte aligned. */
static __inline void Transpose_SSE2(
	const INT16* x,
	INT16*       y)
{
	const __m128i* src = (const __m128i*)x;
	__m128i*       dst = (__m128i*)y;

	/* The eight source rows. */
	const __m128i r0 = _mm_loadu_si128(src + 0);
	const __m128i r1 = _mm_loadu_si128(src + 1);
	const __m128i r2 = _mm_loadu_si128(src + 2);
	const __m128i r3 = _mm_loadu_si128(src + 3);
	const __m128i r4 = _mm_loadu_si128(src + 4);
	const __m128i r5 = _mm_loadu_si128(src + 5);
	const __m128i r6 = _mm_loadu_si128(src + 6);
	const __m128i r7 = _mm_loadu_si128(src + 7);

	/* Pass 1: interleave the 16-bit lanes of neighbouring row pairs. */
	const __m128i w01lo = _mm_unpacklo_epi16(r0, r1);
	const __m128i w01hi = _mm_unpackhi_epi16(r0, r1);
	const __m128i w23lo = _mm_unpacklo_epi16(r2, r3);
	const __m128i w23hi = _mm_unpackhi_epi16(r2, r3);
	const __m128i w45lo = _mm_unpacklo_epi16(r4, r5);
	const __m128i w45hi = _mm_unpackhi_epi16(r4, r5);
	const __m128i w67lo = _mm_unpacklo_epi16(r6, r7);
	const __m128i w67hi = _mm_unpackhi_epi16(r6, r7);

	/* Pass 2: interleave 32-bit pairs. "top" holds two columns gathered
	   from rows 0-3, "bot" the same two columns gathered from rows 4-7. */
	const __m128i top01 = _mm_unpacklo_epi32(w01lo, w23lo);
	const __m128i top23 = _mm_unpackhi_epi32(w01lo, w23lo);
	const __m128i top45 = _mm_unpacklo_epi32(w01hi, w23hi);
	const __m128i top67 = _mm_unpackhi_epi32(w01hi, w23hi);
	const __m128i bot01 = _mm_unpacklo_epi32(w45lo, w67lo);
	const __m128i bot23 = _mm_unpackhi_epi32(w45lo, w67lo);
	const __m128i bot45 = _mm_unpacklo_epi32(w45hi, w67hi);
	const __m128i bot67 = _mm_unpackhi_epi32(w45hi, w67hi);

	/* Pass 3: join the 64-bit halves into complete output rows. */
	dst[0] = _mm_unpacklo_epi64(top01, bot01);
	dst[1] = _mm_unpackhi_epi64(top01, bot01);
	dst[2] = _mm_unpacklo_epi64(top23, bot23);
	dst[3] = _mm_unpackhi_epi64(top23, bot23);
	dst[4] = _mm_unpacklo_epi64(top45, bot45);
	dst[5] = _mm_unpackhi_epi64(top45, bot45);
	dst[6] = _mm_unpacklo_epi64(top67, bot67);
	dst[7] = _mm_unpackhi_epi64(top67, bot67);
}

/* For every quantizer index in use (t->Header.NQIS of them), stores the
   transposed form of each of the 2 x 3 dequantization matrices held in
   t->Dequantize into the reconstructor's own Matrix table. */
void QT_UpdateDequantizeMatrix_SSE2(
	FrameDecoder_t* t)
{
	FrameReconstructor_SSE2_t* recon = t->Reconstructor;

	INT32 qis;

	for (qis = 0; qis < t->Header.NQIS; qis++) {
		INT32 k;

		/* k enumerates the (first index, second index) pairs in the same
		   order as two nested loops would: (0,0) (0,1) (0,2) (1,0) ... */
		for (k = 0; k < 2 * 3; k++) {
			INT32 kind  = k / 3;
			INT32 plane = k % 3;

			Transpose_SSE2(
				t->Dequantize[qis].Matrix[kind][plane],
				recon->Matrix[qis][kind][plane]);
		}
	}
}

/* */

/* Copies the 8x8 pixel block at (x, y) from plane r into the same position
   of plane p.

   p : destination plane (Plane / Pitch are used)
   x : left pixel column of the block
   y : top pixel row of the block
   r : source plane

   All eight source rows are read before any destination row is written.
   The 64-bit moves go through SSE2 registers (_mm_loadl_epi64 /
   _mm_storel_epi64) rather than __m64, so no MMX state is touched and no
   _mm_empty() is needed afterwards; these accesses carry no alignment
   requirement. */
static __inline void Block_CopyPlane8x8_SSE2(
	Plane_t* p,
	INT32    x,
	INT32    y,
	Plane_t* r)
{
	const UINT8* s = r->Plane + y * r->Pitch + x;
	UINT8*       d = p->Plane + y * p->Pitch + x;

	__m128i s0, s1, s2, s3, s4, s5, s6, s7;

	s0 = _mm_loadl_epi64((const __m128i*)s); s += r->Pitch;
	s1 = _mm_loadl_epi64((const __m128i*)s); s += r->Pitch;
	s2 = _mm_loadl_epi64((const __m128i*)s); s += r->Pitch;
	s3 = _mm_loadl_epi64((const __m128i*)s); s += r->Pitch;
	s4 = _mm_loadl_epi64((const __m128i*)s); s += r->Pitch;
	s5 = _mm_loadl_epi64((const __m128i*)s); s += r->Pitch;
	s6 = _mm_loadl_epi64((const __m128i*)s); s += r->Pitch;
	s7 = _mm_loadl_epi64((const __m128i*)s);

	_mm_storel_epi64((__m128i*)d, s0); d += p->Pitch;
	_mm_storel_epi64((__m128i*)d, s1); d += p->Pitch;
	_mm_storel_epi64((__m128i*)d, s2); d += p->Pitch;
	_mm_storel_epi64((__m128i*)d, s3); d += p->Pitch;
	_mm_storel_epi64((__m128i*)d, s4); d += p->Pitch;
	_mm_storel_epi64((__m128i*)d, s5); d += p->Pitch;
	_mm_storel_epi64((__m128i*)d, s6); d += p->Pitch;
	_mm_storel_epi64((__m128i*)d, s7);
}

/* Copies the 16x16 pixel block at (x, y) from plane r into the same
   position of plane p. Rows are moved with aligned 128-bit loads and
   stores, so every row address on both planes must be 16-byte aligned.
   The block is handled as two halves of eight rows; within a half all
   rows are read before any is written. */
static __inline void Block_CopyPlane16x16_SSE2(
	Plane_t* p,
	INT32    x,
	INT32    y,
	Plane_t* r)
{
	const UINT8* src = r->Plane + y * r->Pitch + x;
	UINT8*       dst = p->Plane + y * p->Pitch + x;

	INT32 half;

	for (half = 0; half < 16; half += 8) {
		__m128i row[8];
		INT32   k;

		for (k = 0; k < 8; k++) {
			row[k] = _mm_load_si128((const __m128i*)(src + (half + k) * r->Pitch));
		}

		for (k = 0; k < 8; k++) {
			_mm_store_si128((__m128i*)(dst + (half + k) * p->Pitch), row[k]);
		}
	}
}

/* */

/* Eight 16-bit lanes of 128, 16-byte aligned so it can be read as one
   __m128i. Block_CopyIntra8x8_SSE2 adds this vector to every row of
   residuals before packing to bytes. */
ALIGN(0x10) static const UINT16 IPRED[8] = {
	128, 128, 128, 128, 128, 128, 128, 128
};

/* Writes an intra-coded 8x8 block to plane p at (x, y): each of the 64
   16-bit values in c has 128 added (signed saturating add) and is then
   packed to an unsigned byte with saturation. c is read as eight __m128i
   rows and is therefore expected to be 16-byte aligned. */
static __inline void Block_CopyIntra8x8_SSE2(
	Plane_t*     p,
	INT32        x,
	INT32        y,
	const INT16* c)
{
	UINT8* out = p->Plane + y * p->Pitch + x;

	const __m128i* rows = (const __m128i*)c;
	const __m128i  bias = *((const __m128i*)IPRED);
	const __m128i  zero = _mm_setzero_si128();

	INT32 k;

	for (k = 0; k < 8; k++) {
		const __m128i biased = _mm_adds_epi16(rows[k], bias);

		/* Only the low 8 bytes of the packed vector carry pixels. */
		_mm_storel_epi64((__m128i*)(out + k * p->Pitch), _mm_packus_epi16(biased, zero));
	}
}

/* Adds the 8x8 residual c onto the pixels already present in plane p at
   (x, y). Each pixel is widened to 16 bits, combined with its residual by
   a signed saturating add and packed back to an unsigned byte with
   saturation. The block is processed four rows at a time: the four rows
   are read first, then the four results are written. c is read as eight
   __m128i rows and is therefore expected to be 16-byte aligned. */
static __inline void Block_ReviseInter8x8_SSE2(
	Plane_t*     p,
	INT32        x,
	INT32        y,
	const INT16* c)
{
	UINT8* base = p->Plane + y * p->Pitch + x;

	const __m128i* residual = (const __m128i*)c;
	const __m128i  zero     = _mm_setzero_si128();

	INT32 first;

	for (first = 0; first < 8; first += 4) {
		__m128i pred[4];
		INT32   k;

		for (k = 0; k < 4; k++) {
			const UINT8* in = base + (first + k) * p->Pitch;
			pred[k] = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)in), zero);
		}

		for (k = 0; k < 4; k++) {
			UINT8*        out = base + (first + k) * p->Pitch;
			const __m128i sum = _mm_adds_epi16(residual[first + k], pred[k]);

			_mm_storel_epi64((__m128i*)out, _mm_packus_epi16(sum, zero));
		}
	}
}

/* */

/* Constant vectors for the iDCT routines below, one __m128i per row.
   Row 0 holds 8 in every lane; the column pass adds it before its
   arithmetic shift right by 4. Rows 1..7 hold cos(k * pi / 16) scaled
   by 65536 and rounded, for k = 1..7. */
ALIGN(0x10) static const UINT16 COS[8][8] = {
	{     8,     8,     8,     8,     8,     8,     8,     8 }, /* 0 */
	{ 64277, 64277, 64277, 64277, 64277, 64277, 64277, 64277 }, /* 1 */
	{ 60547, 60547, 60547, 60547, 60547, 60547, 60547, 60547 }, /* 2 */
	{ 54491, 54491, 54491, 54491, 54491, 54491, 54491, 54491 }, /* 3 */
	{ 46341, 46341, 46341, 46341, 46341, 46341, 46341, 46341 }, /* 4 */
	{ 36410, 36410, 36410, 36410, 36410, 36410, 36410, 36410 }, /* 5 */
	{ 25080, 25080, 25080, 25080, 25080, 25080, 25080, 25080 }, /* 6 */
	{ 12785, 12785, 12785, 12785, 12785, 12785, 12785, 12785 }, /* 7 */
};

/* Multiply X by COS row T / 65536. Both macros refer to a variable named C
   that must be in scope at the point of use (a __m128i pointer to COS).
   _mm_mulhi_epi16 is a signed multiply, so a table entry >= 32768 acts as
   (entry - 65536); MUL1 adds X back to compensate and is the form used for
   rows 1..5. MUL0 is the plain high multiply, used for rows 6 and 7 whose
   entries are below 32768. */
#define MUL1(T,X) _mm_add_epi16(_mm_mulhi_epi16(X, C[T]), X)
#define MUL0(T,X) _mm_mulhi_epi16(X, C[T])

/* First-pass 8-point inverse DCT. The eight input vectors X[0..7] are the
   eight transform inputs; each of the 8 sixteen-bit lanes is an independent
   transform. Results go to Y[0..7] without any rounding or shifting.
   x and y are accessed as __m128i arrays and are expected to be 16-byte
   aligned. All arithmetic is wrapping 16-bit. */
static __inline void IDCT_R_8_SSE2(
	const INT16* x,
	INT16*       y)
{
	/* C is referenced implicitly by the MUL0 / MUL1 macros. */
	const __m128i* C = (const __m128i*)COS[0];
	const __m128i* X = (const __m128i*)x;
	__m128i*       Y = (__m128i*)y;

	__m128i s0;
	__m128i t0, t1, t2, t3, t4, t5, t6, t7;

	/* Stage.1 */

	/* Even part: inputs 0/4 scaled by COS row 4, inputs 2/6 rotated. */
	s0 = _mm_add_epi16(X[0], X[4]);
	t0 = MUL1(4, s0);

	s0 = _mm_sub_epi16(X[0], X[4]);
	t1 = MUL1(4, s0);

	t2 = _mm_sub_epi16(MUL0(6, X[2]), MUL1(2, X[6]));
	t3 = _mm_add_epi16(MUL1(2, X[2]), MUL0(6, X[6]));

	/* Odd part: inputs 1/7 and 3/5 rotated. */
	t4 = _mm_sub_epi16(MUL0(7, X[1]), MUL1(1, X[7]));
	t5 = _mm_sub_epi16(MUL1(3, X[5]), MUL1(5, X[3]));

	t6 = _mm_add_epi16(MUL1(5, X[5]), MUL1(3, X[3]));
	t7 = _mm_add_epi16(MUL1(1, X[1]), MUL0(7, X[7]));

	/* Stage.2 */

	/* s0 is a scratch register; t5 and t6 are overwritten with the scaled
	   differences only after the sums have been formed. */
	s0 = _mm_sub_epi16(t4, t5);
	t4 = _mm_add_epi16(t4, t5);
	t5 = MUL1(4, s0);

	s0 = _mm_sub_epi16(t7, t6);
	t7 = _mm_add_epi16(t7, t6);
	t6 = MUL1(4, s0);

	/* Stage.3 */

	/* From here on s0 holds t0 - t3 and is consumed in Stage.4. */
	s0 = _mm_sub_epi16(t0, t3);
	t0 = _mm_add_epi16(t0, t3);

	t3 = _mm_sub_epi16(t1, t2);
	t1 = _mm_add_epi16(t1, t2);

	t2 = _mm_sub_epi16(t6, t5);
	t6 = _mm_add_epi16(t6, t5);

	/* Stage.4 */

	/* Final butterflies: output k and output 7-k share one operand pair. */
	Y[0] = _mm_add_epi16(t0, t7);
	Y[1] = _mm_add_epi16(t1, t6);
	Y[2] = _mm_add_epi16(t3, t2);
	Y[3] = _mm_add_epi16(s0, t4);
	Y[4] = _mm_sub_epi16(s0, t4);
	Y[5] = _mm_sub_epi16(t3, t2);
	Y[6] = _mm_sub_epi16(t1, t6);
	Y[7] = _mm_sub_epi16(t0, t7);
}

/* Reduced form of IDCT_R_8_SSE2 that reads only X[0..3]. Stage 1 omits
   every term that would involve X[4..7], which gives the same result as
   the full routine when those four inputs are zero. Stages 2-4 and the
   output layout are the same as in IDCT_R_8_SSE2: Y[0..7] are written
   without rounding or shifting. x and y are accessed as __m128i arrays
   and are expected to be 16-byte aligned. */
static __inline void IDCT_R_8_4_SSE2(
	const INT16* x,
	INT16*       y)
{
	/* C is referenced implicitly by the MUL0 / MUL1 macros. */
	const __m128i* C = (const __m128i*)COS[0];
	const __m128i* X = (const __m128i*)x;
	__m128i*       Y = (__m128i*)y;

	__m128i s0;
	__m128i t0, t1, t2, t3, t4, t5, t6, t7;

	/* Stage.1 */

	/* With X[4] absent the sum and difference terms coincide. */
	t1 = t0 = MUL1(4, X[0]);

	t2 = MUL0(6, X[2]);
	t3 = MUL1(2, X[2]);

	t4 = MUL0(7, X[1]);
	/* 0 - MUL1(5, X[3]): the X[5] term of the full routine is dropped. */
	t5 = _mm_sub_epi16(_mm_setzero_si128(), MUL1(5, X[3]));

	t6 = MUL1(3, X[3]);
	t7 = MUL1(1, X[1]);

	/* Stage.2 */

	s0 = _mm_sub_epi16(t4, t5);
	t4 = _mm_add_epi16(t4, t5);
	t5 = MUL1(4, s0);

	s0 = _mm_sub_epi16(t7, t6);
	t7 = _mm_add_epi16(t7, t6);
	t6 = MUL1(4, s0);

	/* Stage.3 */

	/* From here on s0 holds t0 - t3 and is consumed in Stage.4. */
	s0 = _mm_sub_epi16(t0, t3);
	t0 = _mm_add_epi16(t0, t3);

	t3 = _mm_sub_epi16(t1, t2);
	t1 = _mm_add_epi16(t1, t2);

	t2 = _mm_sub_epi16(t6, t5);
	t6 = _mm_add_epi16(t6, t5);

	/* Stage.4 */

	Y[0] = _mm_add_epi16(t0, t7);
	Y[1] = _mm_add_epi16(t1, t6);
	Y[2] = _mm_add_epi16(t3, t2);
	Y[3] = _mm_add_epi16(s0, t4);
	Y[4] = _mm_sub_epi16(s0, t4);
	Y[5] = _mm_sub_epi16(t3, t2);
	Y[6] = _mm_sub_epi16(t1, t6);
	Y[7] = _mm_sub_epi16(t0, t7);
}

/* Second-pass 8-point inverse DCT. Stages 1-3 are the same computation as
   in IDCT_R_8_SSE2; Stage 4 additionally adds COS row 0 (8 in every lane)
   and shifts each result right arithmetically by 4, i.e. a rounded divide
   by 16. x and y are accessed as __m128i arrays and are expected to be
   16-byte aligned. All arithmetic is wrapping 16-bit. */
static __inline void IDCT_C_8_SSE2(
	const INT16* x,
	INT16*       y)
{
	/* C is referenced implicitly by the MUL0 / MUL1 macros and directly
	   (C[0]) for the rounding constant. */
	const __m128i* C = (const __m128i*)COS[0];
	const __m128i* X = (const __m128i*)x;
	__m128i*       Y = (__m128i*)y;

	__m128i s0;
	__m128i t0, t1, t2, t3, t4, t5, t6, t7;

	/* Stage.1 */

	s0 = _mm_add_epi16(X[0], X[4]);
	t0 = MUL1(4, s0);

	s0 = _mm_sub_epi16(X[0], X[4]);
	t1 = MUL1(4, s0);

	t2 = _mm_sub_epi16(MUL0(6, X[2]), MUL1(2, X[6]));
	t3 = _mm_add_epi16(MUL1(2, X[2]), MUL0(6, X[6]));

	t4 = _mm_sub_epi16(MUL0(7, X[1]), MUL1(1, X[7]));
	t5 = _mm_sub_epi16(MUL1(3, X[5]), MUL1(5, X[3]));

	t6 = _mm_add_epi16(MUL1(5, X[5]), MUL1(3, X[3]));
	t7 = _mm_add_epi16(MUL1(1, X[1]), MUL0(7, X[7]));

	/* Stage.2 */

	s0 = _mm_sub_epi16(t4, t5);
	t4 = _mm_add_epi16(t4, t5);
	t5 = MUL1(4, s0);

	s0 = _mm_sub_epi16(t7, t6);
	t7 = _mm_add_epi16(t7, t6);
	t6 = MUL1(4, s0);

	/* Stage.3 */

	/* From here on s0 holds t0 - t3 and is consumed in Stage.4. */
	s0 = _mm_sub_epi16(t0, t3);
	t0 = _mm_add_epi16(t0, t3);

	t3 = _mm_sub_epi16(t1, t2);
	t1 = _mm_add_epi16(t1, t2);

	t2 = _mm_sub_epi16(t6, t5);
	t6 = _mm_add_epi16(t6, t5);

	/* Stage.4 */

	/* (butterfly + 8) >> 4 for every output. */
	Y[0] = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(t0, t7), C[0]), 4);
	Y[1] = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(t1, t6), C[0]), 4);
	Y[2] = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(t3, t2), C[0]), 4);
	Y[3] = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(s0, t4), C[0]), 4);
	Y[4] = _mm_srai_epi16(_mm_add_epi16(_mm_sub_epi16(s0, t4), C[0]), 4);
	Y[5] = _mm_srai_epi16(_mm_add_epi16(_mm_sub_epi16(t3, t2), C[0]), 4);
	Y[6] = _mm_srai_epi16(_mm_add_epi16(_mm_sub_epi16(t1, t6), C[0]), 4);
	Y[7] = _mm_srai_epi16(_mm_add_epi16(_mm_sub_epi16(t0, t7), C[0]), 4);
}

/* Reduced form of IDCT_C_8_SSE2 that reads only X[0..3]. Stage 1 omits
   every term that would involve X[4..7], which gives the same result as
   the full routine when those four inputs are zero. Stage 4 adds COS row 0
   (8 in every lane) and shifts right arithmetically by 4, as in
   IDCT_C_8_SSE2. x and y are accessed as __m128i arrays and are expected
   to be 16-byte aligned. */
static __inline void IDCT_C_8_4_SSE2(
	const INT16* x,
	INT16*       y)
{
	/* C is referenced implicitly by the MUL0 / MUL1 macros and directly
	   (C[0]) for the rounding constant. */
	const __m128i* C = (const __m128i*)COS[0];
	const __m128i* X = (const __m128i*)x;
	__m128i*       Y = (__m128i*)y;

	__m128i s0;
	__m128i t0, t1, t2, t3, t4, t5, t6, t7;

	/* Stage.1 */

	/* With X[4] absent the sum and difference terms coincide. */
	t1 = t0 = MUL1(4, X[0]);

	t2 = MUL0(6, X[2]);
	t3 = MUL1(2, X[2]);

	t4 = MUL0(7, X[1]);
	/* 0 - MUL1(5, X[3]): the X[5] term of the full routine is dropped. */
	t5 = _mm_sub_epi16(_mm_setzero_si128(), MUL1(5, X[3]));

	t6 = MUL1(3, X[3]);
	t7 = MUL1(1, X[1]);

	/* Stage.2 */

	s0 = _mm_sub_epi16(t4, t5);
	t4 = _mm_add_epi16(t4, t5);
	t5 = MUL1(4, s0);

	s0 = _mm_sub_epi16(t7, t6);
	t7 = _mm_add_epi16(t7, t6);
	t6 = MUL1(4, s0);

	/* Stage.3 */

	/* From here on s0 holds t0 - t3 and is consumed in Stage.4. */
	s0 = _mm_sub_epi16(t0, t3);
	t0 = _mm_add_epi16(t0, t3);

	t3 = _mm_sub_epi16(t1, t2);
	t1 = _mm_add_epi16(t1, t2);

	t2 = _mm_sub_epi16(t6, t5);
	t6 = _mm_add_epi16(t6, t5);

	/* Stage.4 */

	/* (butterfly + 8) >> 4 for every output. */
	Y[0] = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(t0, t7), C[0]), 4);
	Y[1] = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(t1, t6), C[0]), 4);
	Y[2] = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(t3, t2), C[0]), 4);
	Y[3] = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(s0, t4), C[0]), 4);
	Y[4] = _mm_srai_epi16(_mm_add_epi16(_mm_sub_epi16(s0, t4), C[0]), 4);
	Y[5] = _mm_srai_epi16(_mm_add_epi16(_mm_sub_epi16(t3, t2), C[0]), 4);
	Y[6] = _mm_srai_epi16(_mm_add_epi16(_mm_sub_epi16(t1, t6), C[0]), 4);
	Y[7] = _mm_srai_epi16(_mm_add_epi16(_mm_sub_epi16(t0, t7), C[0]), 4);
}

/* Transposes an 8x8 matrix of 16-bit values: row i of y receives column i
   of x. Both buffers are accessed through plain __m128i dereferences,
   which presume 16-byte alignment of x and y. */
static __inline void Transpose_U_SSE2(
	const INT16* x,
	INT16*       y)
{
	const __m128i* src = (const __m128i*)x;
	__m128i*       dst = (__m128i*)y;

	/* The eight source rows. */
	const __m128i r0 = src[0];
	const __m128i r1 = src[1];
	const __m128i r2 = src[2];
	const __m128i r3 = src[3];
	const __m128i r4 = src[4];
	const __m128i r5 = src[5];
	const __m128i r6 = src[6];
	const __m128i r7 = src[7];

	/* Pass 1: interleave the 16-bit lanes of neighbouring row pairs. */
	const __m128i w01lo = _mm_unpacklo_epi16(r0, r1);
	const __m128i w01hi = _mm_unpackhi_epi16(r0, r1);
	const __m128i w23lo = _mm_unpacklo_epi16(r2, r3);
	const __m128i w23hi = _mm_unpackhi_epi16(r2, r3);
	const __m128i w45lo = _mm_unpacklo_epi16(r4, r5);
	const __m128i w45hi = _mm_unpackhi_epi16(r4, r5);
	const __m128i w67lo = _mm_unpacklo_epi16(r6, r7);
	const __m128i w67hi = _mm_unpackhi_epi16(r6, r7);

	/* Pass 2: interleave 32-bit pairs. "top" holds two columns gathered
	   from rows 0-3, "bot" the same two columns gathered from rows 4-7. */
	const __m128i top01 = _mm_unpacklo_epi32(w01lo, w23lo);
	const __m128i top23 = _mm_unpackhi_epi32(w01lo, w23lo);
	const __m128i top45 = _mm_unpacklo_epi32(w01hi, w23hi);
	const __m128i top67 = _mm_unpackhi_epi32(w01hi, w23hi);
	const __m128i bot01 = _mm_unpacklo_epi32(w45lo, w67lo);
	const __m128i bot23 = _mm_unpackhi_epi32(w45lo, w67lo);
	const __m128i bot45 = _mm_unpacklo_epi32(w45hi, w67hi);
	const __m128i bot67 = _mm_unpackhi_epi32(w45hi, w67hi);

	/* Pass 3: join the 64-bit halves into complete output rows. */
	dst[0] = _mm_unpacklo_epi64(top01, bot01);
	dst[1] = _mm_unpackhi_epi64(top01, bot01);
	dst[2] = _mm_unpacklo_epi64(top23, bot23);
	dst[3] = _mm_unpackhi_epi64(top23, bot23);
	dst[4] = _mm_unpacklo_epi64(top45, bot45);
	dst[5] = _mm_unpackhi_epi64(top45, bot45);
	dst[6] = _mm_unpacklo_epi64(top67, bot67);
	dst[7] = _mm_unpackhi_epi64(top67, bot67);
}

/* */

/* Reorder table used by the dequantize/iDCT routines: entry k is the index
   into the decoded coefficient block whose value is placed at position k
   of the 8x8 working buffer (c0[k] = block[TZZ[k]]).
   NOTE(review): the layout looks like the conventional zig-zag scan with
   rows and columns swapped, matching the transposed dequantization
   matrices -- confirm against the bitstream specification. */
static const UINT8 TZZ[64] = {
	 0,  2,  3,  9, 10, 20, 21, 35,
	 1,  4,  8, 11, 19, 22, 34, 36,
	 5,  7, 12, 18, 23, 33, 37, 48,
	 6, 13, 17, 24, 32, 38, 47, 49,
	14, 16, 25, 31, 39, 46, 50, 57,
	15, 26, 30, 40, 45, 51, 56, 58,
	27, 29, 41, 44, 52, 55, 59, 62,
	28, 42, 43, 53, 54, 60, 61, 63
};

/* Full 8x8 path: reorders the 64 coefficients of block through TZZ,
   multiplies them lane-wise by matrix (low 16 bits of each product are
   kept) and runs the two iDCT passes with a transpose in between. The
   result is written to coeff. matrix and coeff are accessed as __m128i
   arrays and are expected to be 16-byte aligned. */
static __inline void DequantizeIDCT8x8_SSE2(
	const INT16* block,
	const INT16* matrix,
	INT16*       coeff)
{
	ALIGN(0x10) INT16 work[64];

	const __m128i* quant = (const __m128i*)matrix;
	__m128i*       rows  = (__m128i*)work;

	INT32 k;

	/* Reorder */
	for (k = 0; k < 64; k++) {
		work[k] = block[TZZ[k]];
	}

	/* Dequantize */
	for (k = 0; k < 8; k++) {
		rows[k] = _mm_mullo_epi16(rows[k], quant[k]);
	}

	/* iDCT Row */
	IDCT_R_8_SSE2(work, coeff);

	/* Transpose */
	Transpose_U_SSE2(coeff, work);

	/* iDCT Column */
	IDCT_C_8_SSE2(work, coeff);
}

/* */

/* Reduced path for blocks whose non-zero coefficients are confined to the
   first ten entries of block. Only those ten values are reordered into the
   upper-left triangle of the first four working rows; every other lane of
   those rows is zero. The rows are dequantized and passed through the
   reduced iDCT routines, which read the first four rows only.

   block  : decoded coefficients (entries 0..9 are read)
   matrix : dequantization matrix laid out to match the reordered buffer
   coeff  : receives the 64 reconstructed values; written with __m128i
            stores by the iDCT helpers, so it must be 16-byte aligned

   Dequantization uses SSE2 multiplies over whole rows rather than MMX
   (__m64) multiplies over the low halves. The upper four lanes of each
   row are zero, so the products are identical, and no MMX state is left
   behind that would need _mm_empty(). matrix is read with unaligned
   loads, so no alignment requirement is added for it here. */
static __inline void DequantizeIDCT8x8_16_SSE2(
	const INT16* block,
	const INT16* matrix,
	INT16*       coeff)
{
	ALIGN(0x10) INT16 c0[64];

	const __m128i z = _mm_setzero_si128();

	/* Rows 4..7 of c0 are left unset: IDCT_R_8_4_SSE2 never reads them
	   and Transpose_U_SSE2 overwrites the whole buffer afterwards. */
	_mm_store_si128((__m128i*)(c0 + 0x00), z);
	_mm_store_si128((__m128i*)(c0 + 0x08), z);
	_mm_store_si128((__m128i*)(c0 + 0x10), z);
	_mm_store_si128((__m128i*)(c0 + 0x18), z);

	/* Reorder */
	c0[ 0 + 0] = block[TZZ[ 0 + 0]];
	c0[ 0 + 1] = block[TZZ[ 0 + 1]];
	c0[ 0 + 2] = block[TZZ[ 0 + 2]];
	c0[ 0 + 3] = block[TZZ[ 0 + 3]];

	c0[ 8 + 0] = block[TZZ[ 8 + 0]];
	c0[ 8 + 1] = block[TZZ[ 8 + 1]];
	c0[ 8 + 2] = block[TZZ[ 8 + 2]];

	c0[16 + 0] = block[TZZ[16 + 0]];
	c0[16 + 1] = block[TZZ[16 + 1]];

	c0[24 + 0] = block[TZZ[24 + 0]];

	{ /* Dequantize */
		const __m128i* m = (const __m128i*)matrix;
		__m128i*       d = (__m128i*)c0;

		INT32 k;

		for (k = 0; k < 4; k++) {
			const __m128i row = _mm_load_si128(d + k);
			const __m128i q   = _mm_loadu_si128(m + k);

			_mm_store_si128(d + k, _mm_mullo_epi16(row, q));
		}
	}

	/* iDCT Row */
	IDCT_R_8_4_SSE2(c0, coeff);

	/* Transpose */
	Transpose_U_SSE2(coeff, c0);

	/* iDCT Column */
	IDCT_C_8_4_SSE2(c0, coeff);
}

/* */

/* DC-only path: every one of the 64 outputs is the same value,
   ((dc * matrix[0]) + 15) >> 5, truncated to 16 bits.

   dc     : DC coefficient of the block
   matrix : dequantization matrix; only entry 0 is read
   coeff  : receives the 64 values; written with aligned 128-bit stores,
            so it must be 16-byte aligned

   The value is broadcast directly with _mm_set1_epi16 rather than being
   built in an MMX register first, so no MMX state is left behind that
   would need _mm_empty(). */
static __inline void DequantizeIDCT8x8_0_SSE2(
	INT16        dc,
	const INT16* matrix,
	INT16*       coeff)
{
	const __m128i d1 = _mm_set1_epi16((short)(((dc * matrix[0]) + 15) >> 5));

	_mm_store_si128((__m128i*)(coeff + 0 * 8), d1);
	_mm_store_si128((__m128i*)(coeff + 1 * 8), d1);
	_mm_store_si128((__m128i*)(coeff + 2 * 8), d1);
	_mm_store_si128((__m128i*)(coeff + 3 * 8), d1);
	_mm_store_si128((__m128i*)(coeff + 4 * 8), d1);
	_mm_store_si128((__m128i*)(coeff + 5 * 8), d1);
	_mm_store_si128((__m128i*)(coeff + 6 * 8), d1);
	_mm_store_si128((__m128i*)(coeff + 7 * 8), d1);
}

/* */

/* Per-coefficient-index read state used by DecodeCoefficients_SSE2. */
struct DecodeCoefficientsLeaf {

	/* Pending end-of-block count. While positive, each block that reaches
	   this leaf decrements it and stops decoding. */
	INT32 EOB_Run;

	/* Read cursors, advanced together one entry per token. A negative run
	   marks an end-of-block token whose paired Coeff entry is the new
	   EOB_Run; otherwise run is the number of positions to skip before
	   the coefficient is stored. */
	INT8*  Run;
	INT16* Coeff;

}; /* DecodeCoefficientsLeaf */

typedef struct DecodeCoefficientsLeaf DecodeCoefficientsLeaf_t;

/* Decoding state for one plane: one leaf per coefficient index 0..63.
   DecodeCoefficients_SSE2 selects the leaf matching the position of the
   next coefficient to be written. */
struct DecodeCoefficientsContext {

	DecodeCoefficientsLeaf_t Leaf[64];

}; /* DecodeCoefficientsContext */

typedef struct DecodeCoefficientsContext DecodeCoefficientsContext_t;

/* Clears block (64 values, 16-byte aligned) and fills it from the token
   streams in ctx. Tokens are taken from the leaf that corresponds to the
   current write position; decoding stops at an end-of-block condition or
   once the write position reaches or passes the end of the block.
   Returns the final write position relative to the start of block.
   The parameter t is not used. */
static INT32 DecodeCoefficients_SSE2(
	FrameDecoder_t*              t,
	DecodeCoefficientsContext_t* ctx,
	INT16*                       block)
{
	INT16* const limit = block + 64;
	INT16*       cur   = block;

	DecodeCoefficientsLeaf_t* leaf = ctx->Leaf;

	__m128i* rows = (__m128i*)block;
	INT32    k;

	for (k = 0; k < 8; k++) {
		_mm_store_si128(rows + k, _mm_setzero_si128());
	}

	while (cur < limit) {
		INT32 run;
		INT32 coeff;

		/* An end-of-block run is pending: consume one unit and finish. */
		if (leaf->EOB_Run > 0) {
			leaf->EOB_Run -= 1;
			break;
		}

		run   = *((leaf->Run  )++);
		coeff = *((leaf->Coeff)++);

		/* End-of-block token: record the count, let the next iteration
		   act on it. */
		if (run < 0) {
			leaf->EOB_Run = coeff;
			continue;
		}

		cur += run;
		if (cur >= limit) {
			break;
		}

		*(cur++) = coeff;

		leaf = ctx->Leaf + (cur - block);
	}

	return cur - block;
}

/* */

/* Reconstructs one intra 8x8 block of plane p at (x, y).
   A block whose dc equals NOT_CODED is copied from the same position of
   plane r. Otherwise the coefficients are decoded from ctx and one of
   three dequantize/iDCT paths is chosen from the decoded position count,
   using the matrix set Matrix[qi][0][plane]; the result is written with
   Block_CopyIntra8x8_SSE2. */
static void Reconstruct_IntraBlock(
	FrameDecoder_t*              t,
	Plane_t*                     p,
	INT32                        x,
	INT32                        y,
	INT16                        dc,
	INT32                        qi,
	INT32                        plane,
	Plane_t*                     r,
	DecodeCoefficientsContext_t* ctx)
{
	ALIGN(0x10) INT16 block[64];
	ALIGN(0x10) INT16 coeff[64];

	const INT16 (*mat)[64] = t->Reconstructor->Matrix[qi][0];

	INT32 count;

	if (dc == NOT_CODED) {
		Block_CopyPlane8x8_SSE2(p, x, y, r);
		return;
	}

	count = DecodeCoefficients_SSE2(t, ctx, block);

	if (count <= 1) {
		/* DC only */
		DequantizeIDCT8x8_0_SSE2(dc, mat[plane], coeff);

	} else {
		block[0] = dc;

		if (count > 10) {
			DequantizeIDCT8x8_SSE2(block, mat[plane], coeff);
		} else {
			DequantizeIDCT8x8_16_SSE2(block, mat[plane], coeff);
		}
	}

	Block_CopyIntra8x8_SSE2(p, x, y, coeff);
}

/* Reconstructs one inter 8x8 block of plane p at (x, y).
   A block whose dc equals NOT_CODED is copied from the same position of
   plane r when r is not NULL, and left untouched when r is NULL.
   Otherwise the coefficients are decoded from ctx and one of three
   dequantize/iDCT paths is chosen from the decoded position count, using
   the matrix set Matrix[qi][1][plane]; the residual is then added onto
   the pixels already in p with Block_ReviseInter8x8_SSE2. */
static void Reconstruct_InterBlock(
	FrameDecoder_t*              t,
	Plane_t*                     p,
	INT32                        x,
	INT32                        y,
	INT16                        dc,
	INT32                        qi,
	INT32                        plane,
	Plane_t*                     r,
	DecodeCoefficientsContext_t* ctx)
{
	ALIGN(0x10) INT16 block[64];
	ALIGN(0x10) INT16 coeff[64];

	const INT16 (*mat)[64] = t->Reconstructor->Matrix[qi][1];

	INT32 count;

	if (dc == NOT_CODED) {
		if (r != NULL) {
			Block_CopyPlane8x8_SSE2(p, x, y, r);
		}
		return;
	}

	count = DecodeCoefficients_SSE2(t, ctx, block);

	if (count <= 1) {
		/* DC only */
		DequantizeIDCT8x8_0_SSE2(dc, mat[plane], coeff);

	} else {
		block[0] = dc;

		if (count > 10) {
			DequantizeIDCT8x8_SSE2(block, mat[plane], coeff);
		} else {
			DequantizeIDCT8x8_16_SSE2(block, mat[plane], coeff);
		}
	}

	Block_ReviseInter8x8_SSE2(p, x, y, coeff);
}

/* */

/* Horizontal pixel offset, inside a 32x32 area, of each of its sixteen
   8x8 blocks in the order the reconstruct loops visit them. Entries
   4*m .. 4*m+3 lie inside the 16x16 macroblock given by M_PX[m], M_PY[m]. */
static const INT8 S_PX[16] = {
	0*8, 1*8, 1*8, 0*8,
	0*8, 0*8, 1*8, 1*8,
	2*8, 2*8, 3*8, 3*8,
	3*8, 2*8, 2*8, 3*8
};

/* Vertical pixel offsets matching S_PX entry for entry. */
static const INT8 S_PY[16] = {
	0*8, 0*8, 1*8, 1*8,
	2*8, 3*8, 3*8, 2*8,
	2*8, 3*8, 3*8, 2*8,
	1*8, 1*8, 0*8, 0*8
};

/* Horizontal pixel offset, inside a 32x32 area, of each of its four
   16x16 macroblocks in visiting order. */
static const INT8 M_PX[4] = {
	0*16, 0*16,
	1*16, 1*16
};

/* Vertical pixel offsets matching M_PX entry for entry. */
static const INT8 M_PY[4] = {
	0*16, 1*16,
	1*16, 0*16
};

/* */

/* Reconstructs plane 0 of the current frame (t->Frame[1]).
   The plane is walked in 32x32 steps and, inside each step, macroblock by
   macroblock in M_PX / M_PY order; macroblocks whose origin lies outside
   the MX x MY macroblock area are skipped without consuming any input.
   For each visited macroblock the prediction is first placed into the
   current plane according to the macroblock mode (from t->Frame[2] or
   t->Frame[0], with or without motion vectors), then its four 8x8 blocks
   are reconstructed in S_PX / S_PY order. */
static void Reconstruct_YPlane_SSE2(
	FrameDecoder_t* t)
{
	const INT32 sb_w = t->Index->SX[0] * 32;
	const INT32 sb_h = t->Index->SY[0] * 32;

	const INT32 mb_w = t->Index->MX * 16;
	const INT32 mb_h = t->Index->MY * 16;

	const INT32 bx = t->Index->BX[0];

	const UINT16* bi = t->Index->BIndex[0];

	Plane_t* golden = t->Frame[0];
	Plane_t* cur    = t->Frame[1];
	Plane_t* ref    = t->Frame[2];

	const UINT8*          mode = t->MBMode;
	const MotionVector_t* mv   = t->MV;

	const UINT8* qi = t->BQI;

	ALIGN(0x10) DecodeCoefficientsContext_t ctx = { 0 };

	INT32 sbx, sby;
	INT32 n;

	for (n = 0; n < 64; n++) {
		ctx.Leaf[n].Run   = t->BRun  [0][n];
		ctx.Leaf[n].Coeff = t->BCoeff[0][n];
	}

	for (sby = 0; sby < sb_h; sby += 32) {
		for (sbx = 0; sbx < sb_w; sbx += 32) {
			INT32 mb;

			for (mb = 0; mb < 4; mb++) {
				const INT32 first = mb * 4;       /* index of the MB's first block in S_PX / S_PY */
				const INT32 x0    = sbx + M_PX[mb];
				const INT32 y0    = sby + M_PY[mb];

				INT32    known   = 1;   /* cleared for a mode value outside 0..7 */
				INT32    intra   = 0;
				Plane_t* uncoded = ref; /* source for blocks flagged NOT_CODED */

				INT32 k;

				if (x0 >= mb_w || y0 >= mb_h) {
					continue;
				}

				/* Step 1: prediction for the whole macroblock. */
				switch (*mode) {
				case 0: /* INTER_NOMV */
					Block_CopyPlane16x16_SSE2(cur, x0, y0, ref);
					/* The copy above already covers uncoded blocks. */
					uncoded = NULL;
					break;

				case 1: /* INTRA */
					intra = 1;
					break;

				case 2: /* INTER_MV */
				case 3: /* INTER_MV_LAST */
				case 4: /* INTER_MV_LAST2 */
					MotionComp_Block16x16_SSE2(cur, x0, y0, ref, mv);
					break;

				case 5: /* INTER_GOLDEN_NOMV */
					Block_CopyPlane16x16_SSE2(cur, x0, y0, golden);
					break;

				case 6: /* INTER_GOLDEN_MV */
					MotionComp_Block16x16_SSE2(cur, x0, y0, golden, mv);
					break;

				case 7: /* INTER_MV_FOUR */
				{
					const MotionVector_t* v = mv;

					const INT16* dc = t->DC + (x0 >> 3) + (y0 >> 3) * bx;

					/* A vector is consumed only for blocks that are coded. */
					if (dc[0] != NOT_CODED) {
						MotionComp_Block8x8Y_SSE2(cur, x0 + 0, y0 + 0, ref, v++);
					}

					if (dc[1] != NOT_CODED) {
						MotionComp_Block8x8Y_SSE2(cur, x0 + 8, y0 + 0, ref, v++);
					}

					if (dc[0 + bx] != NOT_CODED) {
						MotionComp_Block8x8Y_SSE2(cur, x0 + 0, y0 + 8, ref, v++);
					}

					if (dc[1 + bx] != NOT_CODED) {
						MotionComp_Block8x8Y_SSE2(cur, x0 + 8, y0 + 8, ref, v++);
					}
					break;
				}

				default:
					known = 0;
					break;

				} /* switch */

				/* Step 2: the four 8x8 blocks of the macroblock. */
				if (known) {
					for (k = 0; k < 4; k++) {
						const INT32 px = sbx + S_PX[first + k];
						const INT32 py = sby + S_PY[first + k];

						if (intra) {
							Reconstruct_IntraBlock(t, cur, px, py, t->DC[bi[k]], qi[k], 0, ref, &ctx);
						} else {
							Reconstruct_InterBlock(t, cur, px, py, t->DC[bi[k]], qi[k], 0, uncoded, &ctx);
						}
					}
				}

				bi   += 4;
				mode += 1;
				mv   += 4;
				qi   += 4;
			}
		}
	}
}

/* */

/* Reconstructs planes 1 and 2 of the current frame (t->Frame[1] + 1 and
   + 2) together. The planes are walked in 32x32 steps and, inside each
   step, block by block in S_PX / S_PY order; blocks whose origin lies
   outside the (MX * 8) x (MY * 8) area are skipped without consuming any
   input. For each visited block the prediction is first placed into both
   planes according to the block mode, then both blocks are reconstructed,
   plane 1 before plane 2 in each step. */
static void Reconstruct_CPlane_SSE2(
	FrameDecoder_t* t)
{
	const INT32 sb_w = t->Index->SX[1] * 32;
	const INT32 sb_h = t->Index->SY[1] * 32;

	const INT32 lim_x = t->Index->MX * 8;
	const INT32 lim_y = t->Index->MY * 8;

	const INT32 bx = t->Index->BX[1];

	Plane_t* golden = t->Frame[0];
	Plane_t* cur    = t->Frame[1];
	Plane_t* ref    = t->Frame[2];

	const INT16* DC0 = t->DC + t->Index->BC[0];
	const INT16* DC1 = DC0   + t->Index->BC[1];

	const UINT8* mode = t->BMode + t->Index->BC[0];

	const UINT8* qi0 = t->BQI + t->Index->BC[0];
	const UINT8* qi1 = qi0    + t->Index->BC[1];

	ALIGN(0x10) DecodeCoefficientsContext_t ctx[2] = { 0 };

	INT32 sbx, sby;
	INT32 n;

	for (n = 0; n < 64; n++) {
		ctx[0].Leaf[n].Run   = t->BRun  [1][n];
		ctx[0].Leaf[n].Coeff = t->BCoeff[1][n];

		ctx[1].Leaf[n].Run   = t->BRun  [2][n];
		ctx[1].Leaf[n].Coeff = t->BCoeff[2][n];
	}

	for (sby = 0; sby < sb_h; sby += 32) {
		for (sbx = 0; sbx < sb_w; sbx += 32) {
			INT32 b;

			for (b = 0; b < 16; b++) {
				const INT32 xx  = sbx + S_PX[b];
				const INT32 yy  = sby + S_PY[b];
				const INT32 idx = (xx >> 3) + (yy >> 3) * bx;

				INT32 known        = 1; /* cleared for a mode value outside 0..7 */
				INT32 intra        = 0;
				INT32 copy_uncoded = 1; /* pass the reference planes for NOT_CODED blocks */

				if (xx >= lim_x || yy >= lim_y) {
					continue;
				}

				/* Step 1: prediction for both planes. */
				switch (mode[idx]) {
				case 0: /* INTER_NOMV */
					Block_CopyPlane8x8_SSE2(cur + 1, xx, yy, ref + 1);
					Block_CopyPlane8x8_SSE2(cur + 2, xx, yy, ref + 2);
					/* The copies above already cover uncoded blocks. */
					copy_uncoded = 0;
					break;

				case 1: /* INTRA */
					intra = 1;
					break;

				case 2: /* INTER_MV */
				case 3: /* INTER_MV_LAST */
				case 4: /* INTER_MV_LAST2 */
				case 7: /* INTER_MV_FOUR */
					MotionComp_Block8x8C_SSE2(cur + 1, xx, yy, ref + 1, t->MVC + idx);
					MotionComp_Block8x8C_SSE2(cur + 2, xx, yy, ref + 2, t->MVC + idx);
					break;

				case 5: /* INTER_GOLDEN_NOMV */
					Block_CopyPlane8x8_SSE2(cur + 1, xx, yy, golden + 1);
					Block_CopyPlane8x8_SSE2(cur + 2, xx, yy, golden + 2);
					break;

				case 6: /* INTER_GOLDEN_MV */
					MotionComp_Block8x8C_SSE2(cur + 1, xx, yy, golden + 1, t->MVC + idx);
					MotionComp_Block8x8C_SSE2(cur + 2, xx, yy, golden + 2, t->MVC + idx);
					break;

				default:
					known = 0;
					break;

				} /* switch */

				/* Step 2: residuals for both planes. */
				if (known) {
					if (intra) {
						Reconstruct_IntraBlock(t, cur + 1, xx, yy, DC0[idx], *qi0, 1, ref + 1, ctx + 0);
						Reconstruct_IntraBlock(t, cur + 2, xx, yy, DC1[idx], *qi1, 2, ref + 2, ctx + 1);

					} else if (copy_uncoded) {
						Reconstruct_InterBlock(t, cur + 1, xx, yy, DC0[idx], *qi0, 1, ref + 1, ctx + 0);
						Reconstruct_InterBlock(t, cur + 2, xx, yy, DC1[idx], *qi1, 2, ref + 2, ctx + 1);

					} else {
						Reconstruct_InterBlock(t, cur + 1, xx, yy, DC0[idx], *qi0, 1, NULL, ctx + 0);
						Reconstruct_InterBlock(t, cur + 2, xx, yy, DC1[idx], *qi1, 2, NULL, ctx + 1);
					}
				}

				qi0++;
				qi1++;
			}
		}
	}
}

/* */

/* Precomputed limit vectors for the SSE2 loop filters. Each member holds
   the same 16-bit value in all eight lanes; SFilter_Setup fills them in. */
struct SFilter {

	__m128i L;   /* the limit */
	__m128i L2;  /* the limit shifted left by one (2 * limit) */

	__m128i NL;  /* 0 - limit */
	__m128i NL2; /* (0 - limit) shifted left by one */

}; /* SFilter */

typedef struct SFilter SFilter_t;

/* Fills t with the four limit vectors derived from lim: lim, 2 * lim,
   -lim and 2 * -lim, each broadcast to all eight 16-bit lanes. */
static void SFilter_Setup(SFilter_t* t, INT32 lim)
{
	const __m128i pos = _mm_set1_epi16(lim);
	const __m128i neg = _mm_sub_epi16(_mm_setzero_si128(), pos);

	t->L  = pos;
	t->NL = neg;

	t->L2  = _mm_slli_epi16(pos, 1);
	t->NL2 = _mm_slli_epi16(neg, 1);
}

/* Eight 16-bit lanes of 4: the rounding term the loop filters add before
   their arithmetic shift right by 3. */
ALIGN(0x10) static const UINT16 UR_4[8] = { 4, 4, 4, 4, 4, 4, 4, 4 };

/* */

/* Disabled scalar version of the horizontal loop filter, excluded from the
   build by #if 0. For each of 8 rows (stride s) it computes
   x = (p[-2] - p[1]) + 3 * (p[0] - p[-1]), looks up the correction in
   t->Delta at (x + 4) >> 3, applies it to p[-1] and p[0] and clamps both
   to 0..255 by table selection instead of branches. */
#if 0
static void Filter_LoopFilterH(
	const LoopFilter_t* t,
	UINT8*              b,
	INT32               s)
{
	const INT16* d = t->Delta + 127;

	INT32 p0[2];
	INT32 p1[2];

	INT32 q0[2];
	INT32 q1[2];

	UINT8* p   = b;
	UINT8* end = p + s * 8;

	/* Index 1 of each pair holds the clamp value selected when the
	   comparison below evaluates to 1. */
	p0[1] = 0;
	p1[1] = 0;
	q0[1] = 255;
	q1[1] = 255;

	for (; p < end; p += s) {
		INT32 x = (p[-2] - p[1]) + 3 * (p[0] - p[-1]);
		INT32 v = d[(x + 4) >> 3];

		p0[0] = p[-1] + v;
		p1[0] = p[ 0] - v;

		q0[0] = p0[(p0[0] < 0)];
		q1[0] = p1[(p1[0] < 0)];

		p[-1] = q0[(q0[0] > 255)];
		p[ 0] = q1[(q1[0] > 255)];
	}
}
#endif

/* Filters across a vertical edge for 8 consecutive rows.
   t : precomputed limit vectors
   p : address of the pixel immediately right of the edge in the first row
   s : distance in bytes between rows
   Four pixels (p[-2] .. p[1]) are read from each row, and the two middle
   ones (p[-1], p[0]) are rewritten. The pixel columns are gathered into
   one 16-bit lane per row so that all 8 rows are filtered at once.
   Note: the gather uses __m64 (MMX) intrinsics and no _mm_empty() is
   issued in this function. The 32-bit reads and 16-bit writes go through
   casted pointers at addresses that are not necessarily aligned. */
static __inline void Filter_LoopFilterH_SSE2(
	const SFilter_t* t,
	UINT8*           p,
	INT32            s)
{
	const __m64 z0 = _mm_setzero_si64();

	/* Rows 0..3: four pixels each, widened to 16 bits. */
	__m64 S00 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(*((UINT32*)(p - 2 + 0 * s))), z0);
	__m64 S01 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(*((UINT32*)(p - 2 + 1 * s))), z0);
	__m64 S02 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(*((UINT32*)(p - 2 + 2 * s))), z0);
	__m64 S03 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(*((UINT32*)(p - 2 + 3 * s))), z0);

	/* First step of a 4x4 transpose of rows 0..3. */
	__m64 u00 = _mm_unpacklo_pi16(S00, S01);
	__m64 u01 = _mm_unpackhi_pi16(S00, S01);
	__m64 u02 = _mm_unpacklo_pi16(S02, S03);
	__m64 u03 = _mm_unpackhi_pi16(S02, S03);

	/* Rows 4..7, same treatment. */
	__m64 S10 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(*((UINT32*)(p - 2 + 4 * s))), z0);
	__m64 S11 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(*((UINT32*)(p - 2 + 5 * s))), z0);
	__m64 S12 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(*((UINT32*)(p - 2 + 6 * s))), z0);
	__m64 S13 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(*((UINT32*)(p - 2 + 7 * s))), z0);

	__m64 u10 = _mm_unpacklo_pi16(S10, S11);
	__m64 u11 = _mm_unpackhi_pi16(S10, S11);
	__m64 u12 = _mm_unpacklo_pi16(S12, S13);
	__m64 u13 = _mm_unpackhi_pi16(S12, S13);

	/* P0: the p[-1] column of all 8 rows. */
	__m128i P0 = _mm_unpacklo_epi64(
		_mm_movpi64_epi64(_mm_unpackhi_pi32(u00, u02)),
		_mm_movpi64_epi64(_mm_unpackhi_pi32(u10, u12)));

	/* P1: the p[0] column of all 8 rows. */
	__m128i P1 = _mm_unpacklo_epi64(
		_mm_movpi64_epi64(_mm_unpacklo_pi32(u01, u03)),
		_mm_movpi64_epi64(_mm_unpacklo_pi32(u11, u13)));

	/* X: p[-2] column minus p[1] column. */
	__m128i X = _mm_sub_epi16(
		_mm_unpacklo_epi64(
			_mm_movpi64_epi64(_mm_unpacklo_pi32(u00, u02)),
			_mm_movpi64_epi64(_mm_unpacklo_pi32(u10, u12))),
		_mm_unpacklo_epi64(
			_mm_movpi64_epi64(_mm_unpackhi_pi32(u01, u03)),
			_mm_movpi64_epi64(_mm_unpackhi_pi32(u11, u13))));

	/* R = (X + 3 * Y + 4) >> 3 with Y = p[0] - p[-1]. */
	__m128i Y = _mm_sub_epi16(P1, P0);
	__m128i R = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(_mm_add_epi16(X, Y), _mm_slli_epi16(Y, 1)), *((__m128i*)UR_4)), 3);

	/* Lane masks: R above the limit / R below the negated limit. */
	__m128i m1 = _mm_cmpgt_epi16(R,     t->L);
	__m128i m2 = _mm_cmpgt_epi16(t->NL, R   );

	__m128i r, D;

	/* Limiter: R is kept inside [-L, L], replaced by 2L - R above L and by
	   -2L - R below -L ... */
	r = _mm_or_si128(_mm_andnot_si128(m1, R), _mm_and_si128(_mm_sub_epi16(t->L2,  R), m1));
	r = _mm_or_si128(_mm_andnot_si128(m2, r), _mm_and_si128(_mm_sub_epi16(t->NL2, R), m2));

	/* ... and forced to zero once R lies beyond 2L or -2L. */
	r = _mm_andnot_si128(_mm_cmpgt_epi16(R,      t->L2), r);
	r = _mm_andnot_si128(_mm_cmpgt_epi16(t->NL2, R    ), r);

	P0 = _mm_add_epi16(P0, r);
	P1 = _mm_sub_epi16(P1, r);

	/* Saturate to bytes and interleave so that each 16-bit unit of D is the
	   (p[-1], p[0]) pair of one row. */
	D = _mm_unpacklo_epi8(_mm_packus_epi16(P0, P0), _mm_packus_epi16(P1, P1));

	{
		/* Each 32-bit word carries the pairs of two consecutive rows. */
		UINT32 d0 = _mm_cvtsi128_si32(D);
		UINT32 d1 = _mm_cvtsi128_si32(D = _mm_srli_si128(D, 4)); /* 32 = 4*8 */
		UINT32 d2 = _mm_cvtsi128_si32(D = _mm_srli_si128(D, 4));
		UINT32 d3 = _mm_cvtsi128_si32(    _mm_srli_si128(D, 4));

		*((UINT16*)(p - 1 + 0 * s)) = (UINT16)d0;
		*((UINT16*)(p - 1 + 1 * s)) =         d0 >> 16;
		*((UINT16*)(p - 1 + 2 * s)) = (UINT16)d1;
		*((UINT16*)(p - 1 + 3 * s)) =         d1 >> 16;
		*((UINT16*)(p - 1 + 4 * s)) = (UINT16)d2;
		*((UINT16*)(p - 1 + 5 * s)) =         d2 >> 16;
		*((UINT16*)(p - 1 + 6 * s)) = (UINT16)d3;
		*((UINT16*)(p - 1 + 7 * s)) =         d3 >> 16;
	}
}

/* */

/*
 * Scalar version of the vertical-direction loop filter, compiled out with
 * #if 0. It computes the same filter input as Filter_LoopFilterV_SSE2 below,
 * (p[-2s] - p[s]) + 3 * (p[0] - p[-s]) rounded with +4 and shifted right by 3,
 * but obtains the correction from the t->Delta lookup table rather than from
 * vector compares.
 *
 *   t : filter state; only t->Delta is read here
 *   b : first pixel of the row just below the edge being filtered
 *   s : distance in bytes between consecutive rows
 */
#if 0
static void Filter_LoopFilterV(
	const LoopFilter_t* t,
	UINT8*              b,
	INT32               s)
{
	/* Bias the table pointer so it can be indexed with a signed value. */
	/* NOTE(review): presumably Delta holds entries for -127..127 - confirm. */
	const INT16* d = t->Delta + 127;

	/* Two-entry tables used for branchless clamping: slot [0] holds the   */
	/* candidate value, slot [1] holds the bound substituted when the      */
	/* comparison used as the index evaluates to 1.                        */
	INT32 p0[2];
	INT32 p1[2];

	INT32 q0[2];
	INT32 q1[2];

	UINT8* p   = b;
	UINT8* end = p + 8;

	p0[1] = 0;   /* lower bound */
	p1[1] = 0;
	q0[1] = 255; /* upper bound */
	q1[1] = 255;

	/* One iteration per pixel column; 8 columns in total. */
	for (; p < end; p++) {
		/* Filter input from the two rows above and the two rows from p down. */
		INT32 x = (p[-2 * s] - p[1 * s]) + 3 * (p[0] - p[-1 * s]);
		INT32 v = d[(x + 4) >> 3];

		/* The correction is added above the edge and subtracted below it. */
		p0[0] = p[-s] + v;
		p1[0] = p[ 0] - v;

		/* Clamp to >= 0 ... */
		q0[0] = p0[(p0[0] < 0)];
		q1[0] = p1[(p1[0] < 0)];

		/* ... then to <= 255, and write back. */
		p[-s] = q0[(q0[0] > 255)];
		p[ 0] = q1[(q1[0] > 255)];
	}
}
#endif

/*
 * Loop filter across the edge between row (p - s) and row p, processing
 * 8 adjacent pixel columns at once in 16-bit lanes.
 *
 *   t : filter constants; reads t->L, t->NL, t->L2 and t->NL2
 *       (NOTE(review): presumably +limit, -limit, +2*limit, -2*limit
 *       replicated per lane - confirm against SFilter_Setup)
 *   p : first pixel of the row just below the edge
 *   s : distance in bytes between consecutive rows
 *
 * Rows (p - 2s) .. (p + s) are read; only rows (p - s) and p are written.
 */
static __inline void Filter_LoopFilterV_SSE2(
	const SFilter_t* t,
	UINT8*           p,
	INT32            s)
{
	const __m128i zero = _mm_setzero_si128();

	/* The four rows straddling the edge: two above p, p itself, one below. */
	__m128i* const row_a2 = (__m128i*)(p - 2 * s);
	__m128i* const row_a1 = (__m128i*)(p - s);
	__m128i* const row_b0 = (__m128i*)(p + 0);
	__m128i* const row_b1 = (__m128i*)(p + s);

	/* 8 pixels from each row, zero-extended from bytes to 16-bit lanes. */
	__m128i a2 = _mm_unpacklo_epi8(_mm_loadl_epi64(row_a2), zero);
	__m128i a1 = _mm_unpacklo_epi8(_mm_loadl_epi64(row_a1), zero);
	__m128i b0 = _mm_unpacklo_epi8(_mm_loadl_epi64(row_b0), zero);
	__m128i b1 = _mm_unpacklo_epi8(_mm_loadl_epi64(row_b1), zero);

	/* (a2 - b1) + 3 * (b0 - a1); the 3x is formed as d + (d << 1). */
	__m128i outer = _mm_sub_epi16(a2, b1);
	__m128i inner = _mm_sub_epi16(b0, a1);
	__m128i sum   = _mm_add_epi16(
		_mm_add_epi16(outer, inner),
		_mm_slli_epi16(inner, 1));

	/* Add the UR_4 constant, then arithmetic shift right by 3.            */
	/* NOTE(review): UR_4 is presumably 4 in every lane (rounding) - confirm. */
	__m128i R = _mm_srai_epi16(_mm_add_epi16(sum, *((__m128i*)UR_4)), 3);

	/* Per-lane masks describing where R lies relative to the thresholds. */
	__m128i over_l   = _mm_cmpgt_epi16(R, t->L);
	__m128i under_nl = _mm_cmpgt_epi16(t->NL, R);
	__m128i outside  = _mm_or_si128(
		_mm_cmpgt_epi16(R, t->L2),
		_mm_cmpgt_epi16(t->NL2, R));

	__m128i adj;
	__m128i out_above;
	__m128i out_below;

	/* R > L   -> L2 - R, otherwise keep R. */
	adj = _mm_or_si128(
		_mm_and_si128(over_l, _mm_sub_epi16(t->L2, R)),
		_mm_andnot_si128(over_l, R));

	/* R < NL  -> NL2 - R, otherwise keep the previous choice. */
	adj = _mm_or_si128(
		_mm_and_si128(under_nl, _mm_sub_epi16(t->NL2, R)),
		_mm_andnot_si128(under_nl, adj));

	/* No correction at all where R lies beyond L2 or below NL2. */
	adj = _mm_andnot_si128(outside, adj);

	/* The correction is added above the edge and subtracted below it. */
	out_above = _mm_add_epi16(a1, adj);
	out_below = _mm_sub_epi16(b0, adj);

	/* Saturate to unsigned bytes and write 8 pixels back to each row. */
	_mm_storel_epi64(row_a1, _mm_packus_epi16(out_above, out_above));
	_mm_storel_epi64(row_b0, _mm_packus_epi16(out_below, out_below));
}

/* */

/*
 * Applies the loop filter to the three consecutive planes starting at
 * t->Frame[1].
 *
 * The filter constants are built once from t->Filter.Limit. Blocks are
 * visited plane by plane in raster order (bx blocks per row, by rows, taken
 * from t->Index->BX/BY), with b walking t->DC one entry per block. A block
 * whose entry equals NOT_CODED is skipped. For every other block:
 *
 *   - its left edge is filtered unless it is in the first column,
 *   - its top edge is filtered unless it is in the first row,
 *   - its right edge is filtered when the right neighbour is NOT_CODED
 *     (otherwise that neighbour filters the shared edge as its own left),
 *   - its bottom edge is filtered when the neighbour one block row further
 *     on is NOT_CODED (same reasoning).
 *
 * The MMX state is cleared before returning, see the note at the end.
 */
static void FrameLoopFilter_SSE2(
	FrameDecoder_t* t)
{
	INT32 i;
	INT32 x, y;

	const INT16* b = t->DC;

	Plane_t* plane = t->Frame[1];

	ALIGN(0x10) SFilter_t sf;

	SFilter_Setup(&sf, t->Filter.Limit);

	for (i = 0; i < 3; i++, plane++) {
		INT32 bx = t->Index->BX[i];
		INT32 by = t->Index->BY[i];

		UINT8* r0 = plane->Plane;

		/* r0 advances one block row (8 pixel rows) per iteration. */
		for (y = 0; y < by; y++, r0 += plane->Pitch * 8) {
			UINT8* r = r0;

			/* r advances one block (8 pixels) per iteration; b follows. */
			for (x = 0; x < bx; x++, r += 8, b++) {
				if (*b != NOT_CODED) {
					/* Left edge. */
					if (x > 0) {
						Filter_LoopFilterH_SSE2(&sf, r, plane->Pitch);
					}

					/* Top edge. */
					if (y > 0) {
						Filter_LoopFilterV_SSE2(&sf, r, plane->Pitch);
					}

					/* Right edge, only when the right neighbour will not */
					/* filter it itself.                                  */
					if (x < bx - 1 && b[ 1] == NOT_CODED) {
						Filter_LoopFilterH_SSE2(&sf, r + 8, plane->Pitch);
					}

					/* Bottom edge, only when the neighbour one block row */
					/* further on will not filter it itself.              */
					if (y < by - 1 && b[bx] == NOT_CODED) {
						Filter_LoopFilterV_SSE2(&sf, r + 8 * plane->Pitch, plane->Pitch);
					}
				}
			}
		}
	}

	/* Filter_LoopFilterH_SSE2 is written with MMX (__m64) intrinsics, and */
	/* warning C4799 (missing EMMS) is disabled for this file. Clear the   */
	/* MMX state here so that x87 floating-point code executed after this  */
	/* function does not run with the register stack still tagged as in    */
	/* use. Issuing EMMS when no MMX instruction ran is harmless.          */
	_mm_empty();
}

/* */

/*
 * Reconstructs one decoded frame: runs the Y-plane reconstruction, then the
 * C-plane reconstruction, and finally the frame loop filter when the filter
 * limit is positive.
 *
 *   t : decoder state; passed unchanged to every stage
 */
void QT_ReconstructFrame_SSE2(
	FrameDecoder_t* t)
{
	Reconstruct_YPlane_SSE2(t);
	Reconstruct_CPlane_SSE2(t);

	/* A limit of zero or below means the loop filter is not applied. */
	if (t->Filter.Limit <= 0) {
		return;
	}

	FrameLoopFilter_SSE2(t);
}

/* */

