/* CSConverter_SSE2.c */
/* 2009/06/19         */

#include "StdAfx.h"

#include "TheoraDecoder.h"

#include "CSConverter.h"

/* */

/* MSVC C4799 ("function has no EMMS instruction"): CopyCSC_8 below uses
   __m64 without calling _mm_empty() itself, so the warning is silenced
   for this file. */
#pragma warning(disable : 4799)

/* */

/*
 * Copy one row of bytes from s to d in 16-byte steps.
 *
 *   d  - destination row; accessed with _mm_store_si128, which requires
 *        a 16-byte aligned address
 *   s  - source row; accessed with _mm_load_si128 (same requirement)
 *   cx - number of bytes to copy; whole 16-byte blocks are always moved,
 *        so a cx that is not a multiple of 16 is effectively rounded up.
 *        Nothing is copied when cx <= 0.
 */
static __inline void CopyCSC_16(
	UINT8*       d,
	const UINT8* s,
	INT32        cx)
{
	__m128i*       dst = (__m128i*)d;
	const __m128i* src = (const __m128i*)s;

	INT32 done;

	for (done = 0; done < cx; done += 16) {
		__m128i block = _mm_load_si128(src++);
		_mm_store_si128(dst++, block);
	}
}

/*
 * Copy one row of bytes from s to d in 8-byte steps using __m64
 * assignments.
 *
 *   d  - destination row
 *   s  - source row
 *   cx - number of bytes to copy; whole 8-byte blocks are always moved,
 *        so a cx that is not a multiple of 8 is effectively rounded up.
 *        Nothing is copied when cx <= 0.
 *
 * No _mm_empty() is issued here; the caller is expected to do that once
 * all of its __m64 work is finished.
 */
static __inline void CopyCSC_8(
	UINT8*       d,
	const UINT8* s,
	INT32        cx)
{
	__m64*       dst = (__m64*)d;
	const __m64* src = (const __m64*)s;

	INT32 left = cx;

	while (left > 0) {
		*dst++ = *src++;
		left -= 8;
	}
}

/* */

/*
 * Copy the three planes of a decoded picture into a planar frame buffer.
 *
 *   output - decoder output; Plane[0] is read with a row pitch of
 *            output->CX, Plane[1] and Plane[2] with a row pitch of
 *            output->CX / 2
 *   frame  - destination; frame->Frame receives CY rows of CX bytes at
 *            pitch frame->Pitch, followed (after Rasters * Pitch bytes)
 *            by a half-size plane taken from Plane[2], followed (after a
 *            further Rasters * Pitch / 4 bytes) by one taken from Plane[1]
 *
 * Source rows are visited from row (output->CY - frame->Y - 1) towards
 * lower addresses while destination rows are written towards higher
 * addresses.
 *
 * The luma copy always uses aligned 16-byte moves, so the luma row
 * addresses must be 16-byte aligned and CX is effectively rounded up to a
 * multiple of 16.  The chroma copy uses 16-byte moves only when the
 * copied width and both chroma pitches are multiples of 16; otherwise it
 * falls back to 8-byte moves.
 *
 * NOTE(review): the chroma source pointers are offset by frame->X, not
 * frame->X / 2, although the chroma planes are addressed at half pitch --
 * confirm whether frame->X is ever non-zero.
 */
void QT_CSConvert_YV12_SSE2(
	const QT_Output_t* output,
	QT_Frame_t*        frame)
{
	UINT8* pb0 = (UINT8*)(frame->Frame);
	UINT8* pb1 = pb0 + frame->Rasters * frame->Pitch;
	UINT8* pb2 = pb1 + frame->Rasters * frame->Pitch / 4;
	UINT8* end;

	INT32 r0 = output->CY - frame->Y;

	const UINT8* s0 = output->Plane[0] + frame->X + (r0     - 1) * output->CX;
	const UINT8* s1 = output->Plane[2] + frame->X + (r0 / 2 - 1) * output->CX / 2;
	const UINT8* s2 = output->Plane[1] + frame->X + (r0 / 2 - 1) * output->CX / 2;

	INT32 cx2 = frame->CX / 2;     /* chroma bytes copied per row */
	INT32 sp2 = output->CX / 2;    /* source chroma row pitch     */
	INT32 dp2 = frame->Pitch / 2;  /* destination chroma pitch    */

	/* Luma plane. */
	end = pb0 + frame->CY * frame->Pitch;
	while (pb0 < end) {
		CopyCSC_16(pb0, s0, frame->CX);
		pb0 += frame->Pitch;
		s0  -= output->CX;
	}

	/* Chroma planes.  The source pointers step by the source chroma pitch
	   (the same pitch used above to locate the starting row), not by the
	   copied width, so a frame narrower than the decoder output still
	   reads the correct rows. */
	if (((cx2 | sp2 | dp2) & 0xf) == 0) {
		/* Width and both pitches keep every row 16-byte aligned. */
		end = pb1 + (frame->CY / 2) * dp2;
		while (pb1 < end) {
			CopyCSC_16(pb1, s1, cx2);
			pb1 += dp2;
			s1  -= sp2;
		}

		end = pb2 + (frame->CY / 2) * dp2;
		while (pb2 < end) {
			CopyCSC_16(pb2, s2, cx2);
			pb2 += dp2;
			s2  -= sp2;
		}

	} else {
		end = pb1 + (frame->CY / 2) * dp2;
		while (pb1 < end) {
			CopyCSC_8(pb1, s1, cx2);
			pb1 += dp2;
			s1  -= sp2;
		}

		end = pb2 + (frame->CY / 2) * dp2;
		while (pb2 < end) {
			CopyCSC_8(pb2, s2, cx2);
			pb2 += dp2;
			s2  -= sp2;
		}
	}

	/* CopyCSC_8 uses __m64; clear the MMX state before returning. */
	_mm_empty();
}

/*
 * Interleave the three planes of a decoded picture into a packed frame
 * buffer, two destination rows per pass.
 *
 *   output - decoder output; Plane[0] supplies the luma bytes (row pitch
 *            output->CX), Plane[1] and Plane[2] supply the u and v bytes
 *            (row pitch output->CX / 2)
 *   frame  - destination; frame->Frame receives CY rows at pitch
 *            frame->Pitch, each holding CX * 2 bytes
 *
 * Every 4 output bytes are: luma, u, luma, v.  One u/v row is shared by
 * the two luma rows handled in the same pass.  Source rows are visited
 * from row (output->CY - frame->Y - 1) towards lower addresses while the
 * destination is written towards higher addresses.
 *
 * Luma loads and all stores are aligned 16-byte accesses, and each inner
 * step consumes 16 luma bytes, so CX is effectively rounded up to a
 * multiple of 16 and rows are always written in pairs.
 *
 * NOTE(review): the u/v source pointers are offset by frame->X, not
 * frame->X / 2, although those planes are addressed at half pitch --
 * confirm whether frame->X is ever non-zero.
 */
void QT_CSConvert_YUY2_SSE2(
	const QT_Output_t* output,
	QT_Frame_t*        frame)
{
	UINT8* row      = (UINT8*)(frame->Frame);
	UINT8* row_stop = row + frame->CY * frame->Pitch;

	INT32 bottom = output->CY - frame->Y;

	const UINT8* src_y = output->Plane[0] + frame->X + (bottom     - 1) * output->CX;
	const UINT8* src_u = output->Plane[1] + frame->X + (bottom / 2 - 1) * output->CX / 2;
	const UINT8* src_v = output->Plane[2] + frame->X + (bottom / 2 - 1) * output->CX / 2;

	while (row < row_stop) {
		UINT8* out_a  = row;
		UINT8* out_b  = row + frame->Pitch;
		UINT8* out_a_stop = row + frame->CX * 2;

		const UINT8* luma_a = src_y;
		const UINT8* luma_b = src_y - output->CX;
		const UINT8* cu     = src_u;
		const UINT8* cv     = src_v;

		while (out_a < out_a_stop) {
			__m128i ya = _mm_load_si128((const __m128i*)luma_a);
			__m128i yb = _mm_load_si128((const __m128i*)luma_b);

			/* u0 v0 u1 v1 ... u7 v7 */
			__m128i uv = _mm_unpacklo_epi8(
				_mm_loadl_epi64((const __m128i*)cu),
				_mm_loadl_epi64((const __m128i*)cv));

			/* Low halves give pixels 0-7, high halves pixels 8-15. */
			_mm_store_si128((__m128i*)(out_a +  0), _mm_unpacklo_epi8(ya, uv));
			_mm_store_si128((__m128i*)(out_a + 16), _mm_unpackhi_epi8(ya, uv));

			_mm_store_si128((__m128i*)(out_b +  0), _mm_unpacklo_epi8(yb, uv));
			_mm_store_si128((__m128i*)(out_b + 16), _mm_unpackhi_epi8(yb, uv));

			out_a  += 16 * 2;
			out_b  += 16 * 2;
			luma_a += 16;
			luma_b += 16;
			cu     += 8;
			cv     += 8;
		}

		row   += frame->Pitch * 2;
		src_y -= output->CX * 2;
		src_u -= output->CX / 2;
		src_v -= output->CX / 2;
	}
}

/* */

