/* TTADecoder.c */
/* 2008/12/18   */

#include "StdAfx.h"

#include "TTADecoder.h"

/* */

/*
 * BIT_MASK[n] has the low n bits set, for n = 0..32 (33 entries).
 * The bit reader uses it to extract or test the low n bits of its cache;
 * indices outside 0..32 are out of range.
 */
static const UINT32 BIT_MASK[] = {
	0x00000000, 0x00000001, 0x00000003, 0x00000007,
	0x0000000f, 0x0000001f, 0x0000003f, 0x0000007f,
	0x000000ff, 0x000001ff, 0x000003ff, 0x000007ff,
	0x00000fff, 0x00001fff, 0x00003fff, 0x00007fff,
	0x0000ffff, 0x0001ffff, 0x0003ffff, 0x0007ffff,
	0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff,
	0x00ffffff, 0x01ffffff, 0x03ffffff, 0x07ffffff,
	0x0fffffff, 0x1fffffff, 0x3fffffff, 0x7fffffff,
	0xffffffff
}; /* BIT_MASK */

/*
 * BIT_SHIFT[n] is 1 << n for n = 0..31; entries 32..39 repeat 0x80000000
 * so that lookups slightly past bit 31 stay inside the table (40 entries).
 * Adapt_Decode uses it for the depth-1 value offset and, through SHIFT_16,
 * for its adaptation thresholds.
 */
static const UINT32 BIT_SHIFT[] = {
	0x00000001, 0x00000002, 0x00000004, 0x00000008,
	0x00000010, 0x00000020, 0x00000040, 0x00000080,
	0x00000100, 0x00000200, 0x00000400, 0x00000800,
	0x00001000, 0x00002000, 0x00004000, 0x00008000,
	0x00010000, 0x00020000, 0x00040000, 0x00080000,
	0x00100000, 0x00200000, 0x00400000, 0x00800000,
	0x01000000, 0x02000000, 0x04000000, 0x08000000,
	0x10000000, 0x20000000, 0x40000000, 0x80000000,
	0x80000000, 0x80000000, 0x80000000, 0x80000000,
	0x80000000, 0x80000000, 0x80000000, 0x80000000
}; /* BIT_SHIFT */

/*
 * SHIFT_16[k] aliases BIT_SHIFT[k + 4], i.e. 16 << k while k + 4 <= 31.
 * Valid indices are therefore 0..35.
 */
static const UINT32* SHIFT_16 = &(BIT_SHIFT[4]);

/* */

/*
 * BitDecoder: LSB-first bit reader over a byte buffer.
 * Bits are taken from the low end of Cache; the input is pulled in
 * 32 bits at a time by the Read* functions.
 */
struct BitDecoder {

	UINT64 Cache; /* pending bits, next bit to consume is bit 0 */
	INT32  Count; /* number of valid bits currently held in Cache */

	const UINT8* p;   /* next unread input byte */
	const UINT8* end; /* one past the last input byte (buffer + length) */

}; /* BitDecoder */

typedef struct BitDecoder BitDecoder_t;

/*
 * Attach the bit reader to the `length` bytes starting at `buffer`.
 * The bit cache starts out empty, so the first read triggers a refill.
 */
static void BitDecoder_Init(BitDecoder_t* t, const UINT8* buffer, SIZE_T length)
{
	const UINT8* const first = buffer;
	const UINT8* const limit = first + length;

	t->p   = first;
	t->end = limit;

	t->Count = 0;
	t->Cache = 0;
}

/*
 * Read `bits` (0..32) raw bits, LSB first, into *b.
 *
 * Returns FALSE when `bits` is out of range, the reader state is corrupt,
 * or the input does not contain `bits` more bits; TRUE otherwise.
 *
 * The refill assembles up to four bytes one at a time (little-endian) and
 * never touches memory at or beyond t->end, so a buffer whose length is not
 * a multiple of four is read safely and without alignment requirements.
 */
static BOOL BitDecoder_ReadBinary(BitDecoder_t* t, INT32 bits, UINT32* b)
{
	/* BIT_MASK has entries for 0..32 only; Count is used as a shift count. */
	if (bits < 0 || bits > 32 || t->Count < 0) {
		return FALSE;
	}

	if (t->Count < bits) {
		UINT32 word = 0;
		INT32  avail;
		INT32  i;

		if (t->p >= t->end) {
			return FALSE;
		}

		avail = ((t->end - t->p) >= 4) ? 4 : (INT32)(t->end - t->p);
		for (i = 0; i < avail; i++) {
			word |= (UINT32)(t->p[i]) << (8 * i);
		}

		/* Count < bits <= 32 here, so the shift count is at most 31. */
		t->Cache |= ((UINT64)word << t->Count);
		t->Count += avail * 8;

		t->p += avail;

		/* A short final refill may still not cover the request. */
		if (t->Count < bits) {
			return FALSE;
		}
	}

	*b = (UINT32)(t->Cache) & BIT_MASK[bits];

	t->Cache >>= bits;
	t->Count  -= bits;

	return TRUE;
}

/*
 * Read one unary-coded value: the number of consecutive 1 bits before the
 * next 0 bit is stored in *u, and the terminating 0 bit is consumed.
 *
 * Returns FALSE when the input ends before a terminating 0 bit is found or
 * the reader state is corrupt; TRUE otherwise.
 *
 * Refills assemble up to four bytes one at a time (little-endian) and never
 * read at or beyond t->end; missing high bytes of a short final word are
 * zero.
 */
static BOOL BitDecoder_ReadUnary(BitDecoder_t* t, UINT32* u)
{
	UINT32 v = 0;

	/* Count indexes BIT_MASK (entries 0..32) below. */
	if (t->Count < 0 || t->Count > 32) {
		return FALSE;
	}

	/* All cached bits are 1 (or the cache is empty): count them, refill. */
	while ((t->Cache & BIT_MASK[t->Count]) == BIT_MASK[t->Count]) {
		UINT32 word = 0;
		INT32  avail;
		INT32  i;

		if (t->p >= t->end) {
			return FALSE;
		}

		v += t->Count;

		avail = ((t->end - t->p) >= 4) ? 4 : (INT32)(t->end - t->p);
		for (i = 0; i < avail; i++) {
			word |= (UINT32)(t->p[i]) << (8 * i);
		}

		t->Cache = word;
		t->Count = avail * 8;

		t->p += avail;
	}

	/* The cache now holds a 0 within its valid bits; skip 1s bytewise... */
	while ((t->Cache & 0xff) == 0xff) {
		v += 8;
		t->Cache >>= 8;
		t->Count  -= 8;
	}

	/* ...then bit by bit. */
	while ((t->Cache & 1) == 1) {
		v += 1;
		t->Cache >>= 1;
		t->Count  -= 1;
	}

	/* Drop the terminating 0 bit. */
	t->Cache >>= 1;
	t->Count  -= 1;

	*u = v;

	return TRUE;
}

/*
 * Report end of stream: every input byte has been fetched, fewer than
 * eight bits remain cached, and all of those remaining bits are zero.
 */
static BOOL BitDecoder_EOS(BitDecoder_t* t)
{
	const BOOL drained = (t->p >= t->end) && (t->Count < 8);

	if (!drained) {
		return FALSE;
	}

	return ((t->Cache & BIT_MASK[t->Count]) != 0) ? FALSE : TRUE;
}

/* */

/*
 * Adapt: state of the adaptive code used by Adapt_Decode.
 * Index 0 belongs to values whose unary prefix is zero ("depth 0"),
 * index 1 to all other values ("depth 1").
 */
struct Adapt {

	UINT32 k0; /* width in bits of the binary suffix at depth 0 */
	UINT32 k1; /* width in bits of the binary suffix at depth 1 */

	UINT32 sum0; /* running sum that drives adaptation of k0 */
	UINT32 sum1; /* running sum that drives adaptation of k1 */

}; /* Adapt */

typedef struct Adapt Adapt_t;

/*
 * Reset the adaptive coder to suffix widths k0/k1 and seed each running
 * sum with the matching SHIFT_16 threshold.
 */
static void Adapt_Init(Adapt_t* t, UINT32 k0, UINT32 k1)
{
	const UINT32 seed0 = SHIFT_16[k0];
	const UINT32 seed1 = SHIFT_16[k1];

	t->sum0 = seed0;
	t->sum1 = seed1;

	t->k0 = k0;
	t->k1 = k1;
}

/*
 * Decode one adaptively coded value from `d` into *v and update the
 * adaptation state in `t`.
 *
 * A unary prefix selects the depth: prefix 0 uses k0, any other prefix uses
 * k1 with the prefix reduced by one. The value is (prefix << k) | suffix,
 * where the suffix is k raw bits (omitted when k is 0). Depth-1 values
 * update sum1/k1 and are offset by BIT_SHIFT[k0]; every value updates
 * sum0/k0.
 *
 * Returns FALSE when the bit reader runs out of data or when the adaptation
 * state has left the range the lookup tables and shifts support (only
 * reachable with corrupt input); TRUE otherwise.
 */
static BOOL Adapt_Decode(Adapt_t* t, BitDecoder_t* d, INT32* v)
{
	UINT32 val = 0;

	UINT32 depth, k;

	UINT32 unary;

	/*
	 * k0/k1 index BIT_SHIFT (40 entries, reached as k + 5 via SHIFT_16)
	 * and serve as a 32-bit shift count. Each may grow by one per call,
	 * so bounding both on entry keeps every access below in range.
	 */
	if (t->k0 > 31 || t->k1 > 31) {
		return FALSE;
	}

	if (!BitDecoder_ReadUnary(d, &unary)) {
		return FALSE;
	}

	if (unary == 0) {
		depth = 0;
		k = t->k0;
	} else {
		depth = 1;
		k = t->k1;
		unary -= 1;
	}

	if (k > 0) {
		UINT32 b;
		if (!BitDecoder_ReadBinary(d, k, &b)) {
			return FALSE;
		}
		val = (unary << k) | b;
	} else {
		val = unary;
	}

	if (depth == 1) {
		t->sum1 += val - (t->sum1 >> 4);
		if (t->k1 > 0 && t->sum1 < SHIFT_16[t->k1]) {
			t->k1 -= 1;
		} else if (t->sum1 > SHIFT_16[t->k1 + 1]) {
			t->k1 += 1;
		}
		val += BIT_SHIFT[t->k0];
	}

	{
		t->sum0 += val - (t->sum0 >> 4);
		if (t->k0 > 0 && t->sum0 < SHIFT_16[t->k0]) {
			t->k0 -= 1;
		} else if (t->sum0 > SHIFT_16[t->k0 + 1]) {
			t->k0 += 1;
		}
	}

	*v = (INT32)val;

	return TRUE;
}

/* */

/*
 * Filter presets: { shift, mode } pairs in the order Filter_Init takes them.
 * The frame decoders in this file always use row 1.
 * NOTE(review): the per-row numbers look like bits per sample - confirm.
 */
static const INT32 FLT_SET[4][2] = {
	{ 10, 1 }, /*  8 */
	{  9, 1 }, /* 16 */
	{ 10, 1 }, /* 24 */
	{ 12, 0 }  /* 32 */
};

/* Capacity of each Filter array; Filter_Process touches indices 0..8 only. */
#define TTA_MAX_ORDER 16

/*
 * Filter: state of the adaptive prediction filter run by Filter_Process.
 */
struct Filter {

	INT32 shift; /* right shift applied to the weighted sum */
	INT32 round; /* rounding term, 1 << (shift - 1), seeds the weighted sum */
	INT32 error; /* input value seen by the previous call; its sign steers the weight update */
	INT32 mutex; /* non-zero: the three newest history entries are stored as running differences */

	INT32 qm[TTA_MAX_ORDER]; /* filter weights */
	INT32 dx[TTA_MAX_ORDER]; /* per-tap weight update steps */
	INT32 dl[TTA_MAX_ORDER]; /* sample history, oldest first */

}; /* Filter */

typedef struct Filter Filter_t;

/*
 * Clear the whole filter state, then store the shift, the matching
 * rounding term 1 << (shift - 1), and the mode flag.
 */
static void Filter_Init(Filter_t* t, INT32 shift, INT32 mode)
{
	const INT32 half = 1 << (shift - 1);

	memset(t, 0, sizeof(*t));

	t->mutex = mode;
	t->round = half;
	t->shift = shift;
}

/*
 * Run one step of the adaptive filter on *s, in place.
 *
 * The eight weights are first nudged by their update steps according to
 * the sign of the previous input (no change when it was zero), then the
 * weighted sum of the history, shifted right by t->shift, is added to *s.
 * Finally the history and the update steps are advanced by one position.
 */
static void Filter_Process(Filter_t* t, INT32* s)
{
	INT32* hist = t->dl;
	INT32* wgt  = t->qm;
	INT32* step = t->dx;

	INT32 acc = t->round;
	INT32 i;

	if (t->error < 0) {
		for (i = 0; i < 8; i++) {
			wgt[i] -= step[i];
			acc += hist[i] * wgt[i];
		}
	} else if (t->error > 0) {
		for (i = 0; i < 8; i++) {
			wgt[i] += step[i];
			acc += hist[i] * wgt[i];
		}
	} else {
		for (i = 0; i < 8; i++) {
			acc += hist[i] * wgt[i];
		}
	}

	/* New update steps from the signs of the four newest history entries. */
	step[8] = ((hist[7] >> 30) | 1) << 2;
	step[7] = ((hist[6] >> 30) | 1) << 1;
	step[6] = ((hist[5] >> 30) | 1) << 1;
	step[5] = ((hist[4] >> 30) | 1);

	t->error = *s;
	*s += (acc >> t->shift);
	hist[8] = *s;

	if (t->mutex != 0) {
		/* Newest first: each entry uses the already updated newer one. */
		for (i = 7; i >= 5; i--) {
			hist[i] = hist[i + 1] - hist[i];
		}
	}

	for (i = 0; i < 8; i++) {
		hist[i] = hist[i + 1];
	}

	for (i = 0; i < 8; i++) {
		step[i] = step[i + 1];
	}
}

/* */

/*
 * Decode one mono frame of `count` samples from `d` into `samples`.
 *
 * Each coded value is mapped back to a signed residual, run through the
 * adaptive filter, and then added to 31/32 of the previous output sample.
 * If the bit stream ends early, the remaining samples are zero-filled.
 *
 * Returns FALSE when a value cannot be decoded, TRUE otherwise.
 */
static BOOL TTADecoder_DecodeMonoFrame(BitDecoder_t* d, INT32* samples, INT32 count)
{
	ALIGN(0x10) Adapt_t  rice;
	ALIGN(0x10) Filter_t flt;

	INT32 prev = 0;

	INT32* out = samples;
	INT32* const stop = samples + count;

	Adapt_Init (&rice, 10, 10);
	Filter_Init(&flt, FLT_SET[1][0], FLT_SET[1][1]);

	while (out < stop) {
		INT32 code;
		if (!Adapt_Decode(&rice, d, &code)) {
			return FALSE;
		}

		/* Odd codes are positive residuals, even codes non-positive. */
		if ((code & 1) != 0) {
			*out = (code + 1) >> 1;
		} else {
			*out = -code >> 1;
		}

		Filter_Process(&flt, out);

		*out += (INT32)((((UINT64)prev << 5) - prev) >> 5);
		prev  = *out;

		out += 1;

		if (out < stop && BitDecoder_EOS(d)) {
			memset(out, 0, (stop - out) * sizeof(INT32));
			break;
		}
	}

	return TRUE;
}

/*
 * Decode one stereo frame of `count` sample pairs from `d` into the
 * interleaved buffer `samples` (2 * count entries).
 *
 * Each channel has its own adaptive coder, filter and previous-sample
 * predictor. After both values of a pair are decoded, the pair is
 * de-correlated in place: [1] += [0] / 2, then [0] = [1] - [0].
 * If the bit stream ends early, the remaining entries are zero-filled.
 *
 * Returns FALSE when a value cannot be decoded, TRUE otherwise.
 */
static BOOL TTADecoder_DecodeStereoFrame(BitDecoder_t* d, INT32* samples, INT32 count)
{
	ALIGN(0x10) Adapt_t  rice[2];
	ALIGN(0x10) Filter_t flt[2];

	INT32 prev[2] = { 0, 0 };
	INT32 ch;

	INT32* out = samples;
	INT32* const stop = samples + count * 2;

	for (ch = 0; ch < 2; ch++) {
		Adapt_Init (&rice[ch], 10, 10);
		Filter_Init(&flt[ch], FLT_SET[1][0], FLT_SET[1][1]);
	}

	while (out < stop) {
		for (ch = 0; ch < 2; ch++) {
			INT32* cur = out + ch;
			INT32  code;

			if (!Adapt_Decode(&rice[ch], d, &code)) {
				return FALSE;
			}

			/* Odd codes are positive residuals, even codes non-positive. */
			if ((code & 1) != 0) {
				*cur = (code + 1) >> 1;
			} else {
				*cur = -code >> 1;
			}

			Filter_Process(&flt[ch], cur);

			*cur += (INT32)((((UINT64)prev[ch] << 5) - prev[ch]) >> 5);
			prev[ch] = *cur;
		}

		out[1] += out[0] / 2;
		out[0]  = out[1] - out[0];

		out += 2;

		if (out < stop && BitDecoder_EOS(d)) {
			memset(out, 0, (stop - out) * sizeof(INT32));
			break;
		}
	}

	return TRUE;
}

/* */

typedef (*Decode_t)(BitDecoder_t*, INT32*, INT32);

/*
 * QTTADecoder: decoder instance. Created zero-filled by QTTA_CreateDecoder
 * and configured by QTTA_SetupDecoder.
 */
struct QTTADecoder {

	INT32 Channels; /* channel count, 1 or 2 */
	INT32 Samples;  /* samples per channel in one frame */

	INT32* Frame;   /* decoded samples, Channels * Samples entries */
	INT16* Output;  /* Frame narrowed to 16 bits, same entry count */

	Decode_t Decode; /* frame routine matching Channels */

}; /* QTTADecoder */

/* */

/*
 * Allocate a zero-filled decoder instance.
 * Returns NULL when the allocation fails. Release the instance with
 * QTTA_ReleaseDecoder.
 */
QTTADecoder_t* QTTA_CreateDecoder(void)
{
	QTTADecoder_t* decoder = (QTTADecoder_t*)malloc(sizeof(*decoder));

	if (decoder != NULL) {
		memset(decoder, 0, sizeof(*decoder));
	}

	return decoder;
}

/*
 * Free a decoder instance together with its frame and output buffers.
 * Passing NULL is allowed and does nothing.
 */
void QTTA_ReleaseDecoder(QTTADecoder_t* d)
{
	if (d != NULL) {
		_aligned_free(d->Frame);
		_aligned_free(d->Output);

		free(d);
	}
}

/*
 * Configure a decoder for `channels` (1 or 2) channels and `samples`
 * samples per channel in each frame, allocating the working buffers.
 *
 * Returns FALSE when `d` is NULL, an argument is out of range, the buffer
 * size would overflow, or an allocation fails. On failure the decoder is
 * left exactly as it was. On success any buffers from an earlier setup are
 * released and replaced, so the function may be called repeatedly.
 */
BOOL QTTA_SetupDecoder(
	QTTADecoder_t* d,
	INT32          channels,
	INT32          samples)
{
	INT32*   frame;
	INT16*   pcm;
	SIZE_T   total;
	Decode_t decode;

	if (d == NULL) {
		return FALSE;
	}

	switch (channels) {
	case 1:
		decode = TTADecoder_DecodeMonoFrame;
		break;

	case 2:
		decode = TTADecoder_DecodeStereoFrame;
		break;

	default:
		return FALSE;
	}

	/* channels * samples is also computed as INT32 by the decode path. */
	if (samples <= 0 || samples > 0x7fffffff / channels) {
		return FALSE;
	}

	total = (SIZE_T)channels * (SIZE_T)samples;
	if (total > ((SIZE_T)-1) / sizeof(INT32)) {
		return FALSE;
	}

	/* Allocate into locals first so a failure cannot leave `d` half set up. */
	frame = (INT32*)_aligned_malloc(
		sizeof(INT32) * total,
		0x10);
	if (frame == NULL) {
		return FALSE;
	}

	pcm = (INT16*)_aligned_malloc(
		sizeof(INT16) * total,
		0x10);
	if (pcm == NULL) {
		_aligned_free(frame);
		return FALSE;
	}

	/* Drop buffers from a previous setup; both are NULL on first use. */
	_aligned_free(d->Frame);
	_aligned_free(d->Output);

	d->Channels = channels;
	d->Samples  = samples;

	d->Frame  = frame;
	d->Output = pcm;

	d->Decode = decode;

	return TRUE;
}

/*
 * Decode one frame from the `size` bytes at `packet`.
 *
 * On success, `output` describes the decoded 16-bit samples: the channel
 * count, the number of samples per channel, and a pointer to the decoder's
 * own output buffer, which is overwritten by the next call.
 *
 * Returns FALSE when an argument is NULL, the decoder has not been set up
 * successfully, or the packet cannot be decoded; `output` is left untouched
 * in those cases.
 */
BOOL QTTA_DecodeFrame(
	QTTADecoder_t*   d,
	const VOID*      packet,
	SIZE_T           size,
	QTTA_Output_t*   output)
{
	ALIGN(0x10) BitDecoder_t b;

	if (d == NULL || packet == NULL || output == NULL) {
		return FALSE;
	}

	/* A decoder that was never (successfully) set up has nothing to run. */
	if (d->Decode == NULL || d->Frame == NULL || d->Output == NULL) {
		return FALSE;
	}

	BitDecoder_Init(&b, (const UINT8*)packet, size);

	if (!d->Decode(&b, d->Frame, d->Samples)) {
		return FALSE;
	}

	{
		INT16* p   = d->Output;
		INT16* end = p + d->Channels * d->Samples;

		/* Plain narrowing cast; values outside the INT16 range are not clamped. */
		const INT32* s = d->Frame;
		while (p < end) {
			*(p++) = (INT16)(*(s++));
		}
	}

	output->Channels = d->Channels;
	output->Length   = d->Samples;
	output->Sample   = d->Output;

	return TRUE;
}

/* */

