#ifndef MATHOPS_AVX_X4_H
#define MATHOPS_AVX_X4_H

///////////////////////////////////
// Math ops for 4-way AVX code
// Author: Angus F. Hewlett
// Copyright FXpansion Audio UK Ltd. 2012
///////////////////////////////////

#include <immintrin.h>

// 32-byte alignment decoration for AVX data.
// VECPRE32 goes before the declared type/name, VECPOST32 after it; exactly one
// of the two expands to something, depending on the compiler's syntax:
//   - MSVC-style compilers (and ICC) use a leading __declspec(align(32)).
//   - GCC-style compilers use a trailing __attribute__((aligned(32))).
#if WIN32 || __ICC || defined(_MSC_VER)
#define VECPRE32 __declspec(align(32))
#define VECPOST32
#else
#define VECPRE32
// The alignment value must be parenthesised: "aligned 32" does not parse.
#define VECPOST32 __attribute__ ((aligned (32)))
#endif

////////////////////////////////
// Math operations for 4-way AVX data
////////////////////////////////


// Math operations on a logical vector made of `interleave` AVX registers.
// Every operation is applied independently to each of the `interleave`
// __m256 lanes held in a vec_float.
class Mathops_AVXx4
{
public:
	// Element counts. The *_2pow constants are log2 of the matching *_elem
	// constant (raw_vec_elem = 8 -> raw_vec_2pow = 3).
	static const int raw_vec_elem = 8;
	static const int raw_vec_2pow = 3;
	static const int interleave = 4;
	static const int vec_elem = 32;		// raw_vec_elem * interleave
	static const int vec_2pow = 5;		// log2(vec_elem); previously 8, which is not log2(32)

	typedef __m256 vec_unaligned_float;
	typedef __m256i vec_unaligned_int;

	// 32-byte aligned variants of the raw register types.
	typedef VECPRE32 vec_unaligned_float vec_float_t VECPOST32;
	typedef VECPRE32 vec_unaligned_int vec_int_t VECPOST32;


	// A group of `interleave` AVX float registers treated as one value.
	class VECPRE32 vec_float
	{
	public:
		vec_float_t m[interleave];

		// Default ctor doesn't initialize. standard c++.
		vforceinline vec_float() {}

		// Copies `interleave` registers from f.
		vforceinline vec_float(const vec_float_t f[interleave]) { *this = f; }

		// Broadcasts the scalar f to every element.
		vforceinline vec_float(const float& f) { *this = f; }

		// Broadcasts the scalar f to every element.
		vforceinline vec_float& operator=(const float& f)
		{
			*this = set1ps(f);
			return *this;
		}

		// Copies `interleave` registers from f.
		vforceinline vec_float& operator=(const vec_float_t f[interleave])
		{
			for (int i = 0; i < interleave; i++)
			{
				m[i] = f[i];
			}
			return *this;
		}

		// Binary arithmetic. These return a new value and leave both operands
		// untouched; the earlier versions assigned the result into the
		// left-hand operand, so `a + b` silently modified `a`.
		vforceinline vec_float operator+(const vec_float& m2) const
		{
			return addps(*this, m2);
		}
		vforceinline vec_float operator*(const vec_float& m2) const
		{
			return mulps(*this, m2);
		}
		vforceinline vec_float operator-(const vec_float& m2) const
		{
			return subps(*this, m2);
		}

		// In-place addition of a broadcast scalar.
		vforceinline vec_float& operator+=(const float& f)
		{
			*this += set1ps(f);
			return *this;
		}

		// In-place element-wise addition.
		vforceinline vec_float& operator+=(const vec_float& f)
		{
			*this = addps(*this, f);
			return *this;
		}
	//	vforceinline operator vec_float_t() const { return m; };
	} VECPOST32;


	// Returns a vec_float with every register set by _mm256_setzero_ps.
	static vforceinline vec_float zerops()
	{
		vec_float rv;
		for (int i = 0; i < interleave; i++)
		{
			rv.m[i] = _mm256_setzero_ps();
		}
		return rv;
	}

	// Returns a vec_float with every register set by _mm256_set1_ps(q1).
	static vforceinline vec_float set1ps(const float& q1)
	{
		vec_float rv;
		for (int i = 0; i < interleave; i++)
		{
			rv.m[i] = _mm256_set1_ps(q1);
		}
		return rv;
	}

	// Register-by-register q1 + q2.
	static vforceinline vec_float addps(const vec_float& q1, const vec_float& q2)
	{
		vec_float rv;
		for (int i = 0; i < interleave; i++)
		{
			rv.m[i] = _mm256_add_ps(q1.m[i], q2.m[i]);
		}
		return rv;
	}

	// Register-by-register q1 - q2.
	static vforceinline vec_float subps(const vec_float& q1, const vec_float& q2)
	{
		vec_float rv;
		for (int i = 0; i < interleave; i++)
		{
			rv.m[i] = _mm256_sub_ps(q1.m[i], q2.m[i]);
		}
		return rv;
	}

	// Register-by-register q1 * q2.
	static vforceinline vec_float mulps(const vec_float& q1, const vec_float& q2)
	{
		vec_float rv;
		for (int i = 0; i < interleave; i++)
		{
			rv.m[i] = _mm256_mul_ps(q1.m[i], q2.m[i]);
		}
		return rv;
	}
};

#endif