#ifndef _TMSSE_VECTOR_HPP
#define _TMSSE_VECTOR_HPP

#include <xmmintrin.h>

#include <cassert>//assert
#include <cmath>
#include <cstddef>//std::size_t
#include <cstdlib>
#include <sstream>

#include "../vector/vector_base.hpp"

/** 
 *  @file sse_vector.hpp
 *  @brief Defines tempest::sse_vector.
 *  @author ototoi / Toru Matsuoka
 *  @date 2004/12/24 
 */

//__BORLANDC__


// Stop the build early when a GCC-compatible compiler has SSE code generation
// disabled; everything below is written with SSE intrinsics.
// (GCC predefines __GNUC__; the previously tested __GNU_C__ is never defined.)
#ifdef __GNUC__
#	ifndef __SSE__
#	error "sse_vector.hpp requires SSE support; enable it (e.g. with -msse)."
#	endif
#endif



// IOF(m,i)  : lane i of the __m128 lvalue m, writable.
// CIOF(m,i) : lane i of the __m128 lvalue m, for read-only access.
#if defined(_MSC_VER) || defined(__ICC)
// This branch relies on __m128 having an m128_f32 member array (as in the
// Microsoft definition of the type). Plain member access is used: pasting
// ')' and '.' with ## does not produce a valid preprocessing token.
#define  IOF(m,i) (m).m128_f32[i]
#define CIOF(m,i) (m).m128_f32[i]

#else
// Otherwise the storage of m is viewed as an array of floats.
#define  IOF(m,i) reinterpret_cast<float*>(&(m))[i]
#define CIOF(m,i) reinterpret_cast<const float*>(&(m))[i]

#endif

//m128_f32
namespace tempest{
	
/** 
 *  @class sse_vector
 *  @brief Four-element float vector held in one SSE register (__m128).
 *
 *  Element i is lane i of the register; the float constructor fills the lanes
 *  in argument order. Arithmetic is element-wise.
 *  @code
 *  tempest::sse_vector v(1.0f, 2.0f, 3.0f, 4.0f);
 *  @endcode    
 */
class sse_vector:public vector_base<sse_vector,float,4>{
private:
	typedef		float					T;
public:
	typedef		T										value_type;		///< type of element
	typedef		T&										reference;		///< reference					
	typedef		const T&								const_reference;///< const reference
	typedef		sse_vector								this_type;		///< self

	typedef		std::size_t								size_type;		///< type of element size
	typedef		std::ptrdiff_t							difference_type;///< difference type of pointer

	typedef		T*										iterator;		///< iterator
	typedef		const T*								const_iterator;	///< const iterator
	
	typedef 	T										param_type;		///< This is for parameter.
	typedef		const this_type &						param_this_type;///< how a vector operand is passed
	
public:
	static const std::size_t c_size = 4;	///< number of elements
	
public:
	//-----------------------------------------------
	//functions for iterator
	// Iterators are plain float pointers into the register's storage.
	// end() is derived from begin() so that no element past the last one is named.
	iterator		begin()			{return &( IOF(m,0));}
	iterator		end()			{return begin() + c_size;}
	const_iterator	begin()	const	{return &(CIOF(m,0));}
	const_iterator	end()	const	{return begin() + c_size;}
	
public:
	/// Default constructor; the elements are left uninitialized.
	sse_vector(){}
	
	/// Wraps a raw register value. Not explicit, so a __m128 converts implicitly.
	sse_vector(const __m128 rhs):m(rhs){}
	
	/// Copy constructor.
	sse_vector(const sse_vector& rhs):m(rhs.m){}
	
	/// Builds (a,b,c,d); omitted trailing arguments default to 0.
	explicit sse_vector(float a /**/, float b = 0, float c = 0, float d = 0):m(_mm_setr_ps (a,b,c,d)){}
	
	/// Loads four consecutive floats from array (unaligned load; array is only read).
	explicit sse_vector(const float * array):m(_mm_loadu_ps(array)){}
	
	
	~sse_vector(){}
	
	//-----------------------------------------------
	//inserters
	/// Replaces the contents with a raw register value.
	this_type& operator = (const __m128 rhs){
		m = rhs;
		return *this;
	}
	
	/// Copy assignment.
	this_type& operator = (const sse_vector& rhs){
		m = rhs.m;
		return *this;
	}
	
	//-----------------------------------------------
	//capacity
	size_type size ()     const { return c_size; }	///< always c_size
	size_type max_size () const { return c_size; }	///< always c_size
	bool      empty ()    const { return false;	 }	///< never empty
	
	//-----------------------------------------------
	//operators
	
	/// Replaces every element x with 0 - x.
	this_type& negate(){
		m = _mm_sub_ps(_mm_setzero_ps(),m);	
		return *this;
	}
	
	/// Element-wise addition.
	this_type& operator += (param_this_type rhs){
		m = _mm_add_ps(m,rhs.get());	
		return *this;
	}
	
	/// Element-wise subtraction.
	this_type& operator -= (param_this_type rhs){
		m = _mm_sub_ps(m,rhs.get());
		return *this;
	}
	
	/// Element-wise multiplication.
	this_type& operator *= (param_this_type rhs){
		m = _mm_mul_ps(m,rhs.get());	
		return *this;
	}
	
	/**
	 *  Element-wise division.
	 *  A real divide is used instead of a refined reciprocal estimate: the
	 *  estimate is not correctly rounded and turns a zero or infinite divisor
	 *  into NaN (0 * inf inside the refinement step).
	 */
	this_type& operator /= (param_this_type rhs){
		m = _mm_div_ps(m,rhs.get());
		return *this;
	}
	
	
	/// Multiplies every element by rhs.
	this_type& operator *= (float rhs){
		m = _mm_mul_ps(m,_mm_set1_ps(rhs));
		return *this;
	}
	
	/// Divides every element by rhs (real divide, see the vector overload).
	this_type& operator /= (float rhs){
		m = _mm_div_ps(m,_mm_set1_ps(rhs));
		return *this;
	}
	
	//--------------------------------
	
	/// Writable access to element i; i is not range-checked.
	value_type& operator[](size_type i){
		return IOF(m,i);
	}
	
	/// Value of element i; i is not range-checked.
	value_type operator[](size_type i) const {
		return CIOF(m,i);
	}
	
	//-----------------------------------
	//
	
	/// Stores the four elements to array (unaligned store).
	void get(float *array) const {_mm_storeu_ps(array,m);}
	
	/// Direct access to the underlying register value.
	__m128& get() {return m;}
	const __m128&  get() const {return m;}
	
	/// Overwrites element index with f; index is not range-checked.
	void set(std::size_t index,float f){IOF(m,index) = f;}
	/// Loads four consecutive floats from array (unaligned load).
	void set(const float * array){m = (_mm_loadu_ps(array));}
	
	/**
	 *  Assigns from the range [a,b), which may hold at most c_size floats.
	 *  Only the floats inside the range are read; elements the range does not
	 *  cover are set to 0.
	 */
	void assign(const float * a,const float * b){
		assert(b-a<=static_cast<int>(c_size));
		float lanes[c_size] = {0.0f, 0.0f, 0.0f, 0.0f};
		const difference_type count = b - a;
		// Staging through a local buffer keeps the load from reading past b.
		for(size_type i = 0; i < c_size && static_cast<difference_type>(i) < count; ++i){
			lanes[i] = a[i];
		}
		this->set(lanes);
	}
	

	//-----------------------------------------------
	//utilities
	/// Euclidean length over all four elements.
	value_type length() const {    	
		return std::sqrt(sqr_length());
	}
	
	/// Sum of the squares of all four elements.
	value_type sqr_length() const {
		const __m128 total = horizontal_sum(_mm_mul_ps(m,m));
		return CIOF(total,0);
	}

	/// Sum of all four elements.
	value_type sum() const{
		const __m128 total = horizontal_sum(m);
		return CIOF(total,0);
	}

	/**
	 *  Scales the vector by an approximate 1/length.
	 *  The factor is the hardware reciprocal-square-root estimate refined by
	 *  one Newton-Raphson step. A zero-length vector is not special-cased, so
	 *  its result is not finite.
	 */
	this_type& normalize(){
		const __m128 x = horizontal_sum(_mm_mul_ps(m,m));	// squared length in every lane
		
		__m128 r = _mm_rsqrt_ps(x);
		// r = r*(1.5f - 0.5f * x*r*r)
		const __m128 half_x_rr = _mm_mul_ps(_mm_set1_ps(0.5f),_mm_mul_ps(x,_mm_mul_ps(r,r)));
		r = _mm_mul_ps(r,_mm_sub_ps(_mm_set1_ps(1.5f),half_x_rr));
		
		m = _mm_mul_ps(m,r);
		return *this;
	}
	
public:
	/// Name of this class as a string.
	const char* debug()const{return "tempest::sse_vector";}

	
private:
	/// Adds the four lanes of v; every lane of the result holds the total.
	static __m128 horizontal_sum(const __m128 v){
		__m128 swapped = _mm_shuffle_ps(v,v,_MM_SHUFFLE(1,0,3,2));			// (v2,v3,v0,v1)
		const __m128 pairs = _mm_add_ps(v,swapped);							// (v0+v2,v1+v3,v0+v2,v1+v3)
		swapped = _mm_shuffle_ps(pairs,pairs,_MM_SHUFFLE(2,3,0,1));			// neighbouring lanes exchanged
		return _mm_add_ps(swapped,pairs);
	}
	
	__m128 m;	///< the four elements
};

/// Unary plus: yields a copy of the operand.
inline sse_vector operator+ (const sse_vector& rhs){
	sse_vector copy(rhs);
	return copy;
}
	
/// Unary minus: subtracts every element of the operand from zero.
inline sse_vector operator- (const sse_vector& rhs){
	const __m128 zero = _mm_setzero_ps();
	const __m128 negated = _mm_sub_ps(zero,rhs.get());
	return sse_vector(negated);
}


// Binary vector-vector operators. Each copies the left operand and applies the
// matching compound assignment of sse_vector to the copy.

/// lhs + rhs
inline sse_vector operator + (const sse_vector& lhs, const sse_vector& rhs){
	sse_vector result(lhs);
	result += rhs;
	return result;
}

/// lhs - rhs
inline sse_vector operator - (const sse_vector& lhs, const sse_vector& rhs){
	sse_vector result(lhs);
	result -= rhs;
	return result;
}

/// lhs * rhs
inline sse_vector operator * (const sse_vector& lhs, const sse_vector& rhs){
	sse_vector result(lhs);
	result *= rhs;
	return result;
}

/// lhs / rhs
inline sse_vector operator / (const sse_vector& lhs, const sse_vector& rhs){
	sse_vector result(lhs);
	result /= rhs;
	return result;
}

/// Vector times scalar: applies sse_vector's scalar *= to a copy of lhs.
inline sse_vector operator * (const sse_vector& lhs, float rhs){
	sse_vector scaled(lhs);
	scaled *= rhs;
	return scaled;
}
/// Scalar times vector. Note the parameter names: the scalar (left operand) is
/// called rhs and the vector (right operand) is called lhs.
inline sse_vector operator * (float rhs, const sse_vector& lhs){
	sse_vector scaled(lhs);
	scaled *= rhs;
	return scaled;
}

/// Vector divided by scalar: applies sse_vector's scalar /= to a copy of lhs.
inline sse_vector operator / (const sse_vector& lhs, float rhs){
	sse_vector quotient(lhs);
	quotient /= rhs;
	return quotient;
}


//-----------------------------------------------	
//utility functions
/// Returns a copy of rhs on which sse_vector::normalize() has been applied;
/// rhs itself is not modified.
inline sse_vector normalize(const sse_vector& rhs){
	sse_vector unit(rhs);
	unit.normalize();
	return unit;
}

/// Free-function form of sse_vector::length().
inline float length(const sse_vector& rhs){
	const float result = rhs.length();
	return result;
}
	
/// Free-function form of sse_vector::sqr_length().
inline float sqr_length(const sse_vector& rhs){
	const float result = rhs.sqr_length();
	return result;
}
	
/// Free-function form of sse_vector::sum().
inline float sum(const sse_vector& rhs){
	const float result = rhs.sum();
	return result;
}

inline float dot(const sse_vector& lhs, const sse_vector& rhs){
	return (lhs * rhs).sum();
}

/**
 *  Cross product of the first three elements (x,y,z = elements 0,1,2).
 *  Element 3 of the result is lhs[3]*rhs[3] - lhs[3]*rhs[3].
 */
inline sse_vector cross(const sse_vector& lhs, const sse_vector& rhs){
	const __m128 a = lhs.get();
	const __m128 b = rhs.get();

	// Rotated views of each operand; element 3 stays in place.
	const __m128 a_yzx = _mm_shuffle_ps(a,a,_MM_SHUFFLE(3,0,2,1));	// (a1,a2,a0,a3)
	const __m128 b_zxy = _mm_shuffle_ps(b,b,_MM_SHUFFLE(3,1,0,2));	// (b2,b0,b1,b3)
	const __m128 a_zxy = _mm_shuffle_ps(a,a,_MM_SHUFFLE(3,1,0,2));	// (a2,a0,a1,a3)
	const __m128 b_yzx = _mm_shuffle_ps(b,b,_MM_SHUFFLE(3,0,2,1));	// (b1,b2,b0,b3)

	const __m128 positive = _mm_mul_ps(a_yzx,b_zxy);
	const __m128 negative = _mm_mul_ps(a_zxy,b_yzx);
	return sse_vector(_mm_sub_ps(positive,negative));
}

//-----------------------------------------------
//compare
/**
 *  Equality: true when all four elements compare equal.
 *  A NaN element on either side makes the vectors unequal.
 *  (Not a template: a template parameter that appears in no function parameter
 *  cannot be deduced, so `a == b` would never find the operator.)
 */
inline bool operator== (const sse_vector& lhs, const sse_vector& rhs){
	// cmpeq sets a lane to all-ones when equal; movemask packs the four lane
	// sign bits into the low four bits of an int.
	return _mm_movemask_ps(_mm_cmpeq_ps(lhs.get(),rhs.get())) == 0xF;
}

/**
 *  Inequality: true when at least one pair of elements differs.
 *  A NaN element on either side makes the vectors unequal.
 *  (Not a template: a template parameter that appears in no function parameter
 *  cannot be deduced, so `a != b` would never find the operator.)
 */
inline bool operator!= (const sse_vector& lhs, const sse_vector& rhs){
	// cmpneq sets a lane to all-ones when the elements differ or are unordered;
	// any set bit in the mask means some element differs.
	return _mm_movemask_ps(_mm_cmpneq_ps(lhs.get(),rhs.get())) != 0;
}

//-----------------------------------------------
//output
/** 
 *	@name output
 */
//@{
	
/** 
 *	ostream << 
 */
template<typename _CharT, class _Traits>
std::basic_ostream<_CharT, _Traits>& operator<<(std::basic_ostream<_CharT, _Traits>& os, const sse_vector& rhs){
	
	
	//rhs.get(f);
	std::basic_ostringstream<_CharT, _Traits> s;
	s.flags(os.flags());
	s.imbue(os.getloc());
	s.precision(os.precision());
	s << "(";
	for(std::size_t i = 0; i < 3; ++i){
		s << rhs[i] <<",";
	}
	s <<rhs[4-1] <<")";
	return os << s.str();
}

//@}
	
#undef IOF
#undef CIOF

}//End of namespace

#endif

