#pragma once
#include <xmmintrin.h>
#include <emmintrin.h>
#include <math.h>
#include <stdint.h>

// 0x1000 (4096) as a long constant. Not referenced anywhere in this header.
// NOTE(review): presumably a fixed-point scale/precision used by callers - confirm.
// Upper-case 'L' suffix: a lower-case 'l' is easily misread as the digit 1.
#define YR_ACCURACY (0x00001000L)

// 16-byte alignment specifier usable between the 'struct' keyword and the tag.
// __declspec(align) only exists on MSVC-compatible compilers; GCC/Clang use
// __attribute__((aligned)). Guarded so it can be defined more than once.
#ifndef YR_ALIGN16
#if defined(_MSC_VER)
#define YR_ALIGN16 __declspec(align(16))
#else
#define YR_ALIGN16 __attribute__((aligned(16)))
#endif
#endif

// Four packed floats, 16-byte aligned. The same storage can be viewed as
// an SSE register value (m), as an indexed array (c) or as named x/y/z/w fields.
struct YR_ALIGN16 _yrtag_vec4f
{
	union
	{
		__m128 m;		// whole vector, for use with _mm_*_ps intrinsics
		float c[4];		// lanes by index: c[0]=x, c[1]=y, c[2]=z, c[3]=w
		struct
		{
			float x;
			float y;
			float z;
			float w;
		};
	};
};
typedef struct _yrtag_vec4f vec4f;

// Negates all four lanes of v, computed as (0 - v).
static vec4f vec4f_neg(const vec4f v)
{
	const __m128 zero = _mm_setzero_ps();
	vec4f negated;
	negated.m = _mm_sub_ps(zero, v.m);
	return negated;
}
// Lane-wise sum v + w over all four components.
static vec4f vec4f_add(const vec4f v, const vec4f w)
{
	vec4f sum;
	sum.m = _mm_add_ps(v.m, w.m);
	return sum;
}
// Lane-wise difference v - w over all four components.
static vec4f vec4f_sub(const vec4f v, const vec4f w)
{
	vec4f diff;
	diff.m = _mm_sub_ps(v.m, w.m);
	return diff;
}
// Scales all four lanes of v by the scalar f.
static vec4f vec4f_mul(const float f, const vec4f v)
{
	const __m128 factor = _mm_set_ps1(f);	// f broadcast to every lane
	vec4f scaled;
	scaled.m = _mm_mul_ps(v.m, factor);
	return scaled;
}

// Lane-wise (Hadamard) product: result lane i is v[i] * w[i].
static vec4f vec4f_elem_mul(const vec4f v, const vec4f w)
{
	const __m128 lanes = _mm_mul_ps(v.m, w.m);
	vec4f product;
	product.m = lanes;
	return product;
}
static float vec3f_dot(const vec4f v, const vec4f w)
{
	//x0*x1 + y0*y1 + z0*z1
	vec4f m2 = { _mm_mul_ps(v.m, w.m) };
	return m2.x + m2.y + m2.z;
}
// 3-component cross product v x w:
//   x = y0*z1 - z0*y1
//   y = z0*x1 - x0*z1
//   z = x0*y1 - y0*x1
// The w lane of the result is w0*w1 - w0*w1.
static vec4f vec3f_cross(const vec4f v, const vec4f w)
{
	// Rotate (x,y,z,w) -> (y,z,x,w) for both operands.
	const __m128 v_yzx = _mm_shuffle_ps(v.m, v.m, _MM_SHUFFLE(3, 0, 2, 1));
	const __m128 w_yzx = _mm_shuffle_ps(w.m, w.m, _MM_SHUFFLE(3, 0, 2, 1));

	// rotated = (x0*y1 - y0*x1, y0*z1 - z0*y1, z0*x1 - x0*z1, w0*w1 - w0*w1),
	// i.e. the cross product with its lanes in (z, x, y, w) order.
	const __m128 rotated = _mm_sub_ps(_mm_mul_ps(v.m, w_yzx),
									  _mm_mul_ps(v_yzx, w.m));

	// One more (y,z,x,w) rotation brings the lanes back to (x, y, z, w).
	vec4f cross;
	cross.m = _mm_shuffle_ps(rotated, rotated, _MM_SHUFFLE(3, 0, 2, 1));
	return cross;
}
// Euclidean length of the xyz part of v; the w component does not contribute.
static float vec3f_length(const vec4f v)
{
	const float length_squared = vec3f_dot(v, v);
	return sqrtf(length_squared);
}
// Divides v by the length of its xyz part.
// Note: all four lanes are divided, so w is scaled as well, and no check is
// made for a zero length (the division is performed regardless).
static vec4f vec3f_normalized(const vec4f v)
{
	const __m128 length = _mm_set_ps1(vec3f_length(v));
	vec4f unit;
	unit.m = _mm_div_ps(v.m, length);
	return unit;
}

// 16-byte alignment specifier usable between the 'struct' keyword and the tag.
// __declspec(align) only exists on MSVC-compatible compilers; GCC/Clang use
// __attribute__((aligned)). Guarded so it can be defined more than once.
#ifndef YR_ALIGN16
#if defined(_MSC_VER)
#define YR_ALIGN16 __declspec(align(16))
#else
#define YR_ALIGN16 __attribute__((aligned(16)))
#endif
#endif

// Four packed 32-bit signed integers, 16-byte aligned. The same storage can be
// viewed as an SSE2 integer register value (m), as an indexed array (c) or as
// named x/y/z/w fields.
struct YR_ALIGN16 _yrtag_vec4i
{
	union
	{
		__m128i m;		// whole vector, for use with _mm_*_epi32 intrinsics
		int32_t c[4];	// lanes by index: c[0]=x, c[1]=y, c[2]=z, c[3]=w
						// (integer typed so c[i] and x/y/z/w agree)
		struct
		{
			int32_t x;
			int32_t y;
			int32_t z;
			int32_t w;
		};
	};
};
typedef struct _yrtag_vec4i vec4i;

// Negates all four 32-bit lanes of v, computed as (0 - v).
static vec4i vec4i_neg(const vec4i v)
{
	const __m128i zero = _mm_setzero_si128();
	vec4i negated;
	negated.m = _mm_sub_epi32(zero, v.m);
	return negated;
}
// Lane-wise 32-bit integer sum v + w.
static vec4i vec4i_add(const vec4i v, const vec4i w)
{
	vec4i sum;
	sum.m = _mm_add_epi32(v.m, w.m);
	return sum;
}
// Lane-wise 32-bit integer difference v - w.
static vec4i vec4i_sub(const vec4i v, const vec4i w)
{
	vec4i diff;
	diff.m = _mm_sub_epi32(v.m, w.m);
	return diff;
}

// Converts each float lane to a 32-bit integer via _mm_cvtps_epi32, which
// rounds according to the current SSE rounding mode rather than truncating
// the way a C cast does.
static vec4i vec4i_from_vec4f(const vec4f v)
{
	vec4i as_int;
	as_int.m = _mm_cvtps_epi32(v.m);
	return as_int;
}
// Converts each 32-bit integer lane to float via _mm_cvtepi32_ps.
static vec4f vec4f_from_vec4i(const vec4i v)
{
	vec4f as_float;
	as_float.m = _mm_cvtepi32_ps(v.m);
	return as_float;
}

// 4x4 float matrix stored as four column vectors: col[j].c[i] is the element
// in row i, column j.
typedef struct _yrtag_mat4f
{
	vec4f col[4];
} mat4f;

// Builds a column-stored mat4f from 12 floats laid out as 3 rows of 4
// (m34[row*4 + col]). The missing fourth row is filled in as [0 0 0 1].
static mat4f mat34f_rows_convert(const float* m34)
{
	mat4f out;

	// Put each source row (plus the implied last row) into a column slot...
	out.col[0].m = _mm_loadu_ps(m34 + 0);
	out.col[1].m = _mm_loadu_ps(m34 + 4);
	out.col[2].m = _mm_loadu_ps(m34 + 8);
	out.col[3].m = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);

	// ...then transpose so that rows become columns.
	_MM_TRANSPOSE4_PS(out.col[0].m, out.col[1].m, out.col[2].m, out.col[3].m);
	return out;
}

// Builds a column-stored mat4f from 16 floats laid out as 4 rows of 4
// (m44[row*4 + col]).
static mat4f mat44f_rows_convert(const float* m44)
{
	mat4f out;

	// Put each source row into a column slot...
	out.col[0].m = _mm_loadu_ps(m44 + 0);
	out.col[1].m = _mm_loadu_ps(m44 + 4);
	out.col[2].m = _mm_loadu_ps(m44 + 8);
	out.col[3].m = _mm_loadu_ps(m44 + 12);

	// ...then transpose so that rows become columns.
	_MM_TRANSPOSE4_PS(out.col[0].m, out.col[1].m, out.col[2].m, out.col[3].m);
	return out;
}

// Matrix * vector: the result is the sum of the matrix columns, each scaled
// by the matching component of vec.
static vec4f mat4f_apply(const mat4f mat, vec4f vec)
{
	const vec4f term_x = vec4f_mul(vec.c[0], mat.col[0]);
	const vec4f term_y = vec4f_mul(vec.c[1], mat.col[1]);
	const vec4f term_z = vec4f_mul(vec.c[2], mat.col[2]);
	const vec4f term_w = vec4f_mul(vec.c[3], mat.col[3]);

	// Pairwise summation: (x + y) + (z + w).
	const __m128 sum_xy = _mm_add_ps(term_x.m, term_y.m);
	const __m128 sum_zw = _mm_add_ps(term_z.m, term_w.m);

	vec4f result;
	result.m = _mm_add_ps(sum_xy, sum_zw);
	return result;
}

static mat4f mat4f_combine(const mat4f mat_a, const mat4f mat_b)
{
	mat4f out;
	__m128 t[4];
	mat4f at = mat_a;
	_MM_TRANSPOSE4_PS(at.col[0].m, at.col[1].m, at.col[2].m, at.col[3].m);

	for(size_t c = 0; c < 4; ++c) {
		for(size_t r = 0; r < 4; ++r) {
			t[r] = _mm_mul_ps(at.col[r].m, mat_b.col[c].m);
		}
		_MM_TRANSPOSE4_PS(t[0], t[1], t[2], t[3]);
		out.col[c].m = _mm_add_ps(_mm_add_ps(t[0], t[1]),
								  _mm_add_ps(t[2], t[3]));
	}
	return out;
}

// Inverts an affine transformation matrix. Only valid for transformation
// matrices whose upper-left 3x3 block has determinant 1 and whose bottom row
// is [0 0 0 1]: the result is never divided by the determinant.
static mat4f mat4f_invert(const mat4f m)
{
	/*
	* [a b c x]
	* [d e f y]
	* [g h i z]
	* [0 0 0 1]
	*
	* D = aei + bfg + cdh - afh - bdi - ceg = 1.0f because this is a transformation matrix
	*
	*
	* M11 = ei-fh
	* M12 = hc-ib
	* ...
	*
	*		[+M11 -M12 +M13]
	* co =	[-M21 +M22 -M23]
	*		[+M31 -M32 +M33]
	*
	* inv = co/D
	*/

	mat4f out = m;
	vec4f off = m.col[3];

	// After the transpose out.col[k] holds row k of m:
	// col[0] = (a b c x), col[1] = (d e f y), col[2] = (g h i z), col[3] = bottom row.
	// The trailing ';' is required: some compilers expand this macro to
	// do { ... } while (0) rather than to a bare { ... } block.
	_MM_TRANSPOSE4_PS(out.col[0].m, out.col[1].m, out.col[2].m, out.col[3].m);

	// Rows rotated to (y, z, x, w) order.
	__m128 bcax = _mm_shuffle_ps(out.col[0].m, out.col[0].m, _MM_SHUFFLE(3, 0, 2, 1));
	__m128 efdy = _mm_shuffle_ps(out.col[1].m, out.col[1].m, _MM_SHUFFLE(3, 0, 2, 1));
	__m128 higz = _mm_shuffle_ps(out.col[2].m, out.col[2].m, _MM_SHUFFLE(3, 0, 2, 1));

	// Rows rotated to (z, x, y, w) order.
	__m128 cabx = _mm_shuffle_ps(out.col[0].m, out.col[0].m, _MM_SHUFFLE(3, 1, 0, 2));
	__m128 fdey = _mm_shuffle_ps(out.col[1].m, out.col[1].m, _MM_SHUFFLE(3, 1, 0, 2));
	__m128 ighz = _mm_shuffle_ps(out.col[2].m, out.col[2].m, _MM_SHUFFLE(3, 1, 0, 2));

	// Columns of the inverse 3x3 block are cross products of the original rows:
	// col0 = row1 x row2, col1 = row2 x row0, col2 = row0 x row1.
	// The w lane of each is a product minus the same product.
	out.col[0].m = _mm_sub_ps(_mm_mul_ps(efdy, ighz), _mm_mul_ps(higz, fdey));
	out.col[1].m = _mm_sub_ps(_mm_mul_ps(higz, cabx), _mm_mul_ps(bcax, ighz));
	out.col[2].m = _mm_sub_ps(_mm_mul_ps(bcax, fdey), _mm_mul_ps(efdy, cabx));

	//apply the inverse rotation to the inverted offset
	//(w is cleared so the not-yet-final col[3] is scaled by zero in the sum)
	off.w = 0.0f;
	out.col[3] = mat4f_apply(out, vec4f_neg(off));
	out.col[3].c[3] = 1.0f;
	return out;
}
