#pragma once

#include <algorithm>
#include <cstddef>
#include <cstdlib>
#include <new>
#include <utility>
#include <vector>

#if defined(_WIN32)
#include <malloc.h>	// _aligned_malloc / _aligned_free
#endif

// Portable spelling of "always inline"; MSVC keeps its native keyword.
#ifndef ARRAY_FORCEINLINE
#  if defined(_MSC_VER)
#    define ARRAY_FORCEINLINE __forceinline
#  elif defined(__GNUC__)
#    define ARRAY_FORCEINLINE inline __attribute__((always_inline))
#  else
#    define ARRAY_FORCEINLINE inline
#  endif
#endif

/**
 * Row-major 2D array of T stored in one contiguous buffer.
 *
 * Element (row, col) lives at pBuff_[row * width_ + col]. The array either
 * owns a 32-byte aligned buffer it allocated itself (isAllocated_ == true) or
 * is a non-owning view over caller-supplied memory (isAllocated_ == false).
 *
 * Owned storage is raw memory: no T constructors or destructors are run on
 * it, so T is expected to be a plain value type.
 */
template <typename T>
struct Array2D
{
	size_t width_;		///< Number of columns (elements per row).
	size_t height_;		///< Number of rows.
	T* pBuff_;		///< First element of the row-major buffer.
	bool isAllocated_;	///< True when pBuff_ is owned and freed by the destructor.

	/**
	 * Allocates uninitialised, 32-byte aligned storage for width_ * height_
	 * elements and stores it in pBuff_. Any previous pBuff_ value is simply
	 * overwritten, not released.
	 *
	 * @throws std::bad_alloc if the byte count would overflow size_t or the
	 *         allocation fails.
	 */
	void allocate() {
		const size_t alignment = 32;
		// Largest element count whose byte size (rounded up to the alignment)
		// still fits in size_t.
		const size_t maxCount = (static_cast<size_t>(-1) - (alignment - 1)) / sizeof(T);
		if (width_ != 0 && height_ > maxCount / width_) {
			throw std::bad_alloc();
		}
		const size_t bytes = sizeof(T) * width_ * height_;
#if defined(_WIN32)
		void* raw = _aligned_malloc(bytes, alignment);
#else
		// std::aligned_alloc wants the size to be a multiple of the alignment.
		const size_t rounded = (bytes + alignment - 1) / alignment * alignment;
		void* raw = std::aligned_alloc(alignment, rounded);
#endif
		// A null result for a zero-byte request is not treated as a failure.
		if (!raw && bytes != 0) {
			throw std::bad_alloc();
		}
		pBuff_ = static_cast<T*>(raw);
	}

	/**
	 * Deep copy: the new array always owns a fresh buffer holding a copy of
	 * arr's elements, even when arr itself is a non-owning view.
	 */
	Array2D(const Array2D& arr)
		:
		width_(arr.width_),
		height_(arr.height_),
		pBuff_(0),
		isAllocated_(true)
	{
		allocate();
		std::copy(arr.pBuff_, arr.pBuff_ + width_ * height_, pBuff_);
	}

	/**
	 * Wraps caller-owned memory without taking ownership.
	 *
	 * @param width  Number of columns.
	 * @param height Number of rows.
	 * @param pBuff  Buffer of at least width * height elements; it must outlive
	 *               this object and is never freed by it.
	 */
	Array2D(size_t width, size_t height, T* pBuff)
		:
		width_(width),
		height_(height),
		pBuff_(pBuff),
		isAllocated_(false)
	{
	}

	/**
	 * Creates an owning array of width x height elements.
	 * The element values are uninitialised.
	 */
	Array2D(size_t width, size_t height)
		:
		width_(width),
		height_(height),
		pBuff_(0),
		isAllocated_(true)
	{
		allocate();
	}

	/**
	 * Deep-copy assignment (copy-and-swap). Afterwards *this owns a private
	 * copy of arr's elements; the buffer previously held is released if it
	 * was owned. Without this operator the compiler-generated assignment
	 * would share pBuff_ between two owners and free it twice.
	 */
	Array2D& operator = (const Array2D& arr) {
		if (this != &arr) {
			Array2D tmp(arr);
			std::swap(width_, tmp.width_);
			std::swap(height_, tmp.height_);
			std::swap(pBuff_, tmp.pBuff_);
			std::swap(isAllocated_, tmp.isAllocated_);
			// tmp now carries the old state and cleans it up on scope exit.
		}
		return *this;
	}

	/** Releases the buffer only when this object allocated it. */
	~Array2D()
	{
		if (isAllocated_) {
#if defined(_WIN32)
			_aligned_free(pBuff_);
#else
			std::free(pBuff_);
#endif
		}
	}

	/** Returns a pointer to the first element of the given row (no bounds check). */
	ARRAY_FORCEINLINE
	T* operator[] (int row) {
		return &pBuff_[row * width_];
	}

	/** Read-only counterpart of operator[] (no bounds check). */
	ARRAY_FORCEINLINE
	const T* operator[] (int row) const {
		return &pBuff_[row * width_];
	}

	/** Multiplies every element by scalar in place and returns *this. */
	Array2D<T>& operator *= (const T& scalar) {
		const size_t count = width_ * height_;
		for (size_t i = 0; i < count; ++i) {
			pBuff_[i] *= scalar;
		}
		return *this;
	}

	/** Returns a scaled copy; scalar is converted to T before multiplying. */
	template <typename T2>
	Array2D<T> operator * (const T2& scalar) const {
		Array2D<T> result(*this);
		result *= scalar;
		return result;
	}

	/**
	 * Matrix-vector product.
	 *
	 * @param vec Vector with at least width_ elements (not checked).
	 * @return Vector of height_ elements, one dot product per row.
	 */
	std::vector<T> operator * (const std::vector<T>& vec) const {
		std::vector<T> result(height_);
		T sum;
		for (size_t row = 0; row < height_; ++row) {
			const T* rowData = pBuff_ + row * width_;
			sum = 0;
			for (size_t col = 0; col < width_; ++col) {
				sum += rowData[col] * vec[col];
			}
			result[row] = sum;
		}
		return result;
	}

	/** Multiplies every element of one row by mult, in place. */
	Array2D<T>& multiply_row_scalar(int row, double mult) {
		T* rowData = (*this)[row];
		for (size_t i = 0; i < width_; ++i) {
			rowData[i] *= mult;
		}
		return *this;
	}

	/** Adds mult times row from_row to row to_row, in place. */
	Array2D<T>& add_row_multiple(int from_row, int to_row, double mult) {
		const T* src = (*this)[from_row];
		T* dst = (*this)[to_row];
		for (size_t i = 0; i < width_; ++i) {
			dst[i] += mult * src[i];
		}
		return *this;
	}

	/**
	 * Computes the inverse of a square matrix by Gauss-Jordan elimination.
	 * Perf doesn't matter since the matrices will be K x K, where
	 * K = number of palette entries.
	 *
	 * The elimination is carried out in place: *this is overwritten while
	 * the same row operations are mirrored into the result. For an
	 * invertible matrix *this ends up as the identity.
	 *
	 * Rows are exchanged only when a diagonal pivot is exactly zero, so
	 * inputs that never hit a zero pivot go through the same arithmetic as
	 * plain elimination. If no non-zero pivot can be found the matrix is
	 * singular and the division by zero is left to produce whatever T
	 * yields for it.
	 *
	 * Precondition: width_ == height_ (not checked).
	 */
	Array2D<T> matrix_inverse() {
		Array2D<T> result(width_, height_);
		Array2D<T>& a = *this;
		const int n = static_cast<int>(width_);
		const int rows = static_cast<int>(height_);

		// Set result to the identity matrix. The fresh buffer is uninitialised,
		// so it must be overwritten; multiplying it by zero would keep any
		// NaN/Inf garbage it happens to contain.
		std::fill(result.pBuff_, result.pBuff_ + width_ * height_, T(0));
		for (int i = 0; i < n; i++) {
			result[i][i] = 1;
		}

		// Reduce to echelon form, mirroring in result
		for (int i = 0; i < n; i++) {
			if (a[i][i] == 0) {
				// Zero pivot: bring up the row below with the largest entry
				// in this column.
				int pivotRow = i;
				T best = T(0);
				for (int r = i + 1; r < rows; r++) {
					const T magnitude = a[r][i] < 0 ? -a[r][i] : a[r][i];
					if (magnitude > best) {
						best = magnitude;
						pivotRow = r;
					}
				}
				if (pivotRow != i) {
					std::swap_ranges(a[i], a[i] + width_, a[pivotRow]);
					std::swap_ranges(result[i], result[i] + width_, result[pivotRow]);
				}
			}

			// Read the pivot before either row is scaled.
			const double scale = 1/a[i][i];
			result.multiply_row_scalar(i, scale);
			multiply_row_scalar(i, scale);

			for (int j = i + 1; j < rows; j++) {
				const double factor = -a[j][i];
				result.add_row_multiple(i, j, factor);
				add_row_multiple(i, j, factor);
			}
		}

		// Back substitute, mirroring in result
		for (int i = n - 1; i >= 0; i--) {
			for (int j = i - 1; j >= 0; j--) {
				const double factor = -a[j][i];
				result.add_row_multiple(i, j, factor);
				add_row_multiple(i, j, factor);
			}
		}

		// result is now the inverse
		return result;
	}

};

/**
 * Scalar-on-the-left product: returns a copy of a with every element
 * multiplied by scalar. The input array is left untouched.
 *
 * Copies once and scales in place; routing through the member operator*
 * would deep-copy the array a second time for the same result.
 */
template <typename T>
Array2D<T> operator * (T scalar, const Array2D<T>& a) {
	Array2D<T> result(a);
	result *= scalar;
	return result;
}

// Portable spelling of "always inline"; MSVC keeps its native keyword.
#ifndef ARRAY_FORCEINLINE
#  if defined(_MSC_VER)
#    define ARRAY_FORCEINLINE __forceinline
#  elif defined(__GNUC__)
#    define ARRAY_FORCEINLINE inline __attribute__((always_inline))
#  else
#    define ARRAY_FORCEINLINE inline
#  endif
#endif

/**
 * 3D array of T stored in one contiguous buffer.
 *
 * Element (x, y, z) lives at index y * width_ * depth_ + x * depth_ + z, so
 * z is the fastest-varying coordinate. The array either owns a buffer obtained
 * with new[] (isAllocated_ == true) or is a non-owning view over
 * caller-supplied memory (isAllocated_ == false).
 */
template <typename T>
struct Array3D
{
private:
	T* pBuff_;		///< First element of the buffer.
public:
	size_t width_;		///< Extent along x.
	size_t height_;		///< Extent along y.
	size_t depth_;		///< Extent along z.
	bool isAllocated_;	///< True when pBuff_ is owned and freed by the destructor.

	/**
	 * Wraps caller-owned memory without taking ownership.
	 *
	 * @param pBuff Buffer of at least width * height * depth elements; it must
	 *              outlive this object and is never freed by it.
	 */
	Array3D(size_t width, size_t height, size_t depth, T* pBuff)
		:
		pBuff_(pBuff),
		width_(width),
		height_(height),
		depth_(depth),
		isAllocated_(false)
	{
	}

	/** Creates an owning array of width * height * depth default-initialised elements. */
	Array3D(size_t width, size_t height, size_t depth)
		:
		pBuff_(new T[width * height * depth]),
		width_(width),
		height_(height),
		depth_(depth),
		isAllocated_(true)
	{
	}

	/**
	 * Deep copy: the new array always owns a fresh buffer holding a copy of
	 * other's elements, even when other is a non-owning view.
	 */
	Array3D(const Array3D& other)
		:
		pBuff_(new T[other.width_ * other.height_ * other.depth_]),
		width_(other.width_),
		height_(other.height_),
		depth_(other.depth_),
		isAllocated_(true)
	{
		const size_t count = width_ * height_ * depth_;
		try {
			std::copy(other.pBuff_, other.pBuff_ + count, pBuff_);
		} catch (...) {
			// The destructor does not run for a half-built object.
			delete[] pBuff_;
			throw;
		}
	}

	/**
	 * Deep-copy assignment (copy-and-swap). The buffer previously held is
	 * released if it was owned.
	 */
	Array3D& operator = (const Array3D& other) {
		if (this != &other) {
			Array3D tmp(other);
			std::swap(pBuff_, tmp.pBuff_);
			std::swap(width_, tmp.width_);
			std::swap(height_, tmp.height_);
			std::swap(depth_, tmp.depth_);
			std::swap(isAllocated_, tmp.isAllocated_);
			// tmp now carries the old state and cleans it up on scope exit.
		}
		return *this;
	}

	/**
	 * Releases the buffer only when this object allocated it. Memory from
	 * new[] must be released with delete[]; a wrapped external buffer is
	 * left alone.
	 */
	~Array3D()
	{
		if (isAllocated_) {
			delete[] pBuff_;
		}
	}

	/** Returns a reference to element (x, y, z); no bounds check. */
	ARRAY_FORCEINLINE
	T& operator() (size_t x, size_t y, size_t z) {
		return pBuff_[
			  y * width_ * depth_
			+ x * depth_
			+ z
		];
	}

	/** Read-only counterpart of operator(); no bounds check. */
	ARRAY_FORCEINLINE
	const T& operator() (size_t x, size_t y, size_t z) const {
		return pBuff_[
			  y * width_ * depth_
			+ x * depth_
			+ z
		];
	}
};


