//
// Created by Eugeny Grishul
//
// See license at http://bamelg.com/license.txt
//

using System.Runtime;
using System.Runtime.InteropServices;

namespace System.Numerics {
	[Alignment( Boundary = 16 )]
	public struct Matrix4x4 {

		public float[4, 4] Value;

		public Matrix4x4() { }
		public Matrix4x4( float[16]& values ) { Memory.Copy( Value, values, sizeof( Value ) ); }
		public Matrix4x4( float[4, 4]& values ) { Memory.Copy( Value, values, sizeof( values ) ); }

		public static readonly Matrix4x4 Identity = new Matrix4x4 {
			Value = new float[4, 4] {
				{ 1, 0, 0, 0 },
				{ 0, 1, 0, 0 },
				{ 0, 0, 1, 0 },
				{ 0, 0, 0, 1 }
			}
		};

		public static Vector4 operator *( Vector4& vector, Matrix4x4& matrix ) {
			return new Vector4( vector.X * matrix.Value[0, 0] + vector.Y * matrix.Value[0, 1] + vector.Z * matrix.Value[0, 2] + vector.W * matrix.Value[0, 3], vector.X * matrix.Value[1, 0] + vector.Y * matrix.Value[1, 1] + vector.Z * matrix.Value[1, 2] + vector.W * matrix.Value[1, 3], vector.X * matrix.Value[2, 0] + vector.Y * matrix.Value[2, 1] + vector.Z * matrix.Value[2, 2] + vector.W * matrix.Value[2, 3], vector.X * matrix.Value[3, 0] + vector.Y * matrix.Value[3, 1] + vector.Z * matrix.Value[3, 2] + vector.W * matrix.Value[3, 3] );
		}

		internal const CallingConvention DefaultCallingConvention = CallingConvention.Default;
		// internal const CallingConvention DefaultCallingConvention = Environment.CurrentCpu == System.Runtime.CompilerServices.CpuID.X86_32 ? CallingConvention.FastCallX86 : CallingConvention.Default;

		[CallingConvention( Value = DefaultCallingConvention )]
		public static Vector4 operator *( Matrix4x4& matrix, Vector4& vector ) asm {
			X86_32 {
				feature( SSE2 ) {
					if( thismethod.CallingConvention == CallingConvention.FastCallX86 ) {
						// ecx - matrix
						// edx - vector
						mov eax, [esp + 4] // result
					}
					else {
						mov ecx, [esp + 4]  // matrix
						mov edx, [esp + 8]  // vector
						mov eax, [esp + 12] // result
					}

					movaps xmm4, [edx]

					movaps xmm0, [ecx]
					movaps xmm1, [ecx + 16]
					movaps xmm2, [ecx + 32]
					movaps xmm3, [ecx + 48]

					pshufd xmm5, xmm4, 0b01010101
					pshufd xmm6, xmm4, 0b10101010
					pshufd xmm7, xmm4, 0b11111111
					pshufd xmm4, xmm4, 0b00000000

					mulps xmm0, xmm4
					mulps xmm1, xmm5
					mulps xmm2, xmm6
					mulps xmm3, xmm7

					addps xmm0, xmm1
					addps xmm2, xmm3
					addps xmm0, xmm2

					movaps [eax], xmm0
					retn
				}
				feature( SSE1 ) {
					if( thismethod.CallingConvention == CallingConvention.FastCallX86 ) {
						// ecx - matrix
						// edx - vector
						mov eax, [esp + 4] // result
					}
					else {
						mov ecx, [esp + 4]  // matrix
						mov edx, [esp + 8]  // vector
						mov eax, [esp + 12] // result
					}

					movaps xmm4, [edx]

					movaps xmm0, [ecx]
					movaps xmm1, [ecx + 16]
					movaps xmm2, [ecx + 32]
					movaps xmm3, [ecx + 48]

					movaps xmm5, xmm4
					movaps xmm6, xmm4
					movaps xmm7, xmm4

					shufps xmm4, xmm4, 0b00000000
					shufps xmm5, xmm5, 0b01010101
					shufps xmm6, xmm6, 0b10101010
					shufps xmm7, xmm7, 0b11111111

					mulps xmm0, xmm4
					mulps xmm1, xmm5
					mulps xmm2, xmm6
					mulps xmm3, xmm7

					addps xmm0, xmm1
					addps xmm2, xmm3
					addps xmm0, xmm2

					movaps [eax], xmm0
					retn
				}
				feature( AMD3DNow ) {
					if( thismethod.CallingConvention == CallingConvention.FastCallX86 ) {
						// ecx - matrix
						// edx - vector
						mov eax, [esp + 4] // result
					}
					else {
						mov ecx, [esp + 4]  // matrix
						mov edx, [esp + 8]  // vector
						mov eax, [esp + 12] // result
					}

					pshufw mm4, [edx], 0b01000100
					pshufw mm5, [edx], 0b11101110
					pshufw mm6, [edx + 8], 0b01000100
					pshufw mm7, [edx + 8], 0b11101110

					movq mm0, [ecx]
					movq mm1, [ecx + 16]
					movq mm2, [ecx + 32]
					movq mm3, [ecx + 48]

					pfmul mm0, mm4
					pfmul mm1, mm5
					pfmul mm2, mm6
					pfmul mm3, mm7

					pfadd mm0, mm1
					pfadd mm2, mm3
					pfadd mm0, mm2

					movq [eax], mm0

					movq mm0, [ecx + 8]
					movq mm1, [ecx + 24]
					movq mm2, [ecx + 40]
					movq mm3, [ecx + 56]

					pfmul mm0, mm4
					pfmul mm1, mm5
					pfmul mm2, mm6
					pfmul mm3, mm7

					pfadd mm0, mm1
					pfadd mm2, mm3
					pfadd mm0, mm2

					movq [eax + 8], mm0

					femms
					retn
				}
				default {
					if( thismethod.CallingConvention == CallingConvention.FastCallX86 ) {
						// ecx - matrix
						// edx - vector
						mov eax, [esp + 4] // result
					}
					else {
						mov ecx, [esp + 4]  // matrix
						mov edx, [esp + 8]  // vector
						mov eax, [esp + 12] // result
					}

					fld dword ptr [ecx]
					fld dword ptr [ecx + 4]
					fld dword ptr [ecx + 8]
					fld dword ptr [ecx + 12]

					fld dword ptr [edx]

					fmul st4, st0
					fmul st3, st0
					fmul st2, st0
					fmulp st1, st0

					fld dword ptr [ecx + 16]
					fld dword ptr [ecx + 20]
					fld dword ptr [ecx + 24]

					fld dword ptr [edx + 4]

					fmul st3, st0
					fmul st2, st0
					fmul st1, st0
					fmul dword ptr [ecx + 28]

					faddp st4, st0
					faddp st4, st0
					faddp st4, st0
					faddp st4, st0

					fld dword ptr [ecx + 32]
					fld dword ptr [ecx + 36]
					fld dword ptr [ecx + 40]

					fld dword ptr [edx + 8]

					fmul st3, st0
					fmul st2, st0
					fmul st1, st0
					fmul dword ptr [ecx + 44]

					faddp st4, st0
					faddp st4, st0
					faddp st4, st0
					faddp st4, st0

					fld dword ptr [ecx + 48]
					fld dword ptr [ecx + 52]
					fld dword ptr [ecx + 56]

					fld dword ptr [edx + 12]

					fmul st3, st0
					fmul st2, st0
					fmul st1, st0
					fmul dword ptr [ecx + 60]

					faddp st4, st0
					faddp st4, st0
					faddp st4, st0
					faddp st4, st0

					fstp dword ptr [eax + 12]
					fstp dword ptr [eax + 8]
					fstp dword ptr [eax + 4]
					fstp dword ptr [eax]

					retn
				}
			}
			default {
				return new Vector4( vector.X * matrix.Value[0, 0] + vector.Y * matrix.Value[1, 0] + vector.Z * matrix.Value[2, 0] + vector.W * matrix.Value[3, 0], vector.X * matrix.Value[0, 1] + vector.Y * matrix.Value[1, 1] + vector.Z * matrix.Value[2, 1] + vector.W * matrix.Value[3, 1], vector.X * matrix.Value[0, 2] + vector.Y * matrix.Value[1, 2] + vector.Z * matrix.Value[2, 2] + vector.W * matrix.Value[3, 2], vector.X * matrix.Value[0, 3] + vector.Y * matrix.Value[1, 3] + vector.Z * matrix.Value[2, 3] + vector.W * matrix.Value[3, 3] );
			}
		}

		public Matrix4x4& Inverse() {
			this = Inversed;
			return this;
		}

		[CallingConvention( Value = DefaultCallingConvention )]
		public Matrix4x4& Transpose() asm {
			X86_32 {
				feature( SSE1 ) {
					if( thismethod.CallingConvention == CallingConvention.FastCallX86 ) {
						// ecx - this
					}
					else {
						mov ecx, [esp + 4] // this
					}

					movaps xmm0, [ecx]
					movaps xmm2, [ecx + 32]

					movaps xmm1, xmm0
					movaps xmm3, xmm2

					unpcklps xmm0, [ecx + 16]
					unpckhps xmm1, [ecx + 16]

					unpcklps xmm2, [ecx + 48]
					unpckhps xmm3, [ecx + 48]

					movaps xmm4, xmm0
					movaps xmm6, xmm1

					shufps xmm0, xmm2, 0b01000100
					shufps xmm4, xmm2, 0b11101110
					shufps xmm1, xmm3, 0b01000100
					shufps xmm6, xmm3, 0b11101110

					movaps [ecx], xmm0
					movaps [ecx + 16], xmm4
					movaps [ecx + 32], xmm1
					movaps [ecx + 48], xmm6

					retn
				}
				feature( MMX ) {
					if( thismethod.CallingConvention == CallingConvention.FastCallX86 ) {
						// ecx - this
					}
					else {
						mov ecx, [esp + 4] // this
					}

					movq mm0, [ecx]
					movq mm1, [ecx + 8]
					movq mm4, [ecx + 32]
					movq mm5, [ecx + 40]

					movq mm2, mm0
					movq mm3, mm1
					movq mm6, mm4
					movq mm7, mm5

					punpckldq mm0, [ecx + 16]
					punpckhdq mm2, [ecx + 16]
					punpckldq mm1, [ecx + 24]
					punpckhdq mm3, [ecx + 24]

					punpckldq mm4, [ecx + 48]
					punpckhdq mm6, [ecx + 48]
					punpckldq mm5, [ecx + 56]
					punpckhdq mm7, [ecx + 56]

					movq [ecx], mm0
					movq [ecx + 8], mm4
					movq [ecx + 16], mm2
					movq [ecx + 24], mm6

					movq [ecx + 32], mm1
					movq [ecx + 40], mm5
					movq [ecx + 48], mm3
					movq [ecx + 56], mm7

					emms
					retn
				}
				default {
					if( thismethod.CallingConvention == CallingConvention.FastCallX86 ) {
						// ecx - this
					}
					else {
						mov ecx, [esp + 4] // this
					}

					fld dword ptr [ecx + 4]
					fld dword ptr [ecx + 8]
					fld dword ptr [ecx + 12]

					fld dword ptr [ecx + 16]
					fld dword ptr [ecx + 32]
					fld dword ptr [ecx + 48]

					fstp dword ptr [edx + 12]
					fstp dword ptr [edx + 8]
					fstp dword ptr [edx + 4]

					fstp dword ptr [edx + 48]
					fstp dword ptr [edx + 32]
					fstp dword ptr [edx + 16]

					fld dword ptr [ecx + 24]
					fld dword ptr [ecx + 28]
					fld dword ptr [ecx + 44]

					fld dword ptr [ecx + 36]
					fld dword ptr [ecx + 52]
					fld dword ptr [ecx + 56]

					fstp dword ptr [edx + 44]
					fstp dword ptr [edx + 28]
					fstp dword ptr [edx + 24]

					fstp dword ptr [edx + 56]
					fstp dword ptr [edx + 52]
					fstp dword ptr [edx + 36]

					retn
				}
			}
			default {
				this = Transposed;
				return this;
			}
		}

		public Matrix4x4 Transposed {
			[CallingConvention( Value = DefaultCallingConvention )]
			get asm {
				X86_32 {
					feature( SSE1 ) {
						if( thismethod.CallingConvention == CallingConvention.FastCallX86 ) {
							// ecx - this
							// edx - result
						}
						else {
							mov ecx, [esp + 4] // this
							mov edx, [esp + 8] // result
						}

						movups xmm0, [ecx]
						movups xmm2, [ecx + 32]

						movups xmm1, xmm0
						movups xmm3, xmm2

						unpcklps xmm0, [ecx + 16]
						unpckhps xmm1, [ecx + 16]

						unpcklps xmm2, [ecx + 48]
						unpckhps xmm3, [ecx + 48]

						movaps xmm4, xmm0
						movaps xmm6, xmm1

						shufps xmm0, xmm2, 0b01000100
						shufps xmm4, xmm2, 0b11101110
						shufps xmm1, xmm3, 0b01000100
						shufps xmm6, xmm3, 0b11101110

						movups [edx], xmm0
						movups [edx + 16], xmm4
						movups [edx + 32], xmm1
						movups [edx + 48], xmm6

						retn
					}
					feature( MMX ) {
						if( thismethod.CallingConvention == CallingConvention.FastCallX86 ) {
							// ecx - this
							// edx - result
						}
						else {
							mov ecx, [esp + 4] // this
							mov edx, [esp + 8] // result
						}

						movq mm0, [ecx]
						movq mm1, [ecx + 8]
						movq mm4, [ecx + 32]
						movq mm5, [ecx + 40]

						movq mm2, mm0
						movq mm3, mm1
						movq mm6, mm4
						movq mm7, mm5

						punpckldq mm0, [ecx + 16]
						punpckhdq mm2, [ecx + 16]
						punpckldq mm1, [ecx + 24]
						punpckhdq mm3, [ecx + 24]

						punpckldq mm4, [ecx + 48]
						punpckhdq mm6, [ecx + 48]
						punpckldq mm5, [ecx + 56]
						punpckhdq mm7, [ecx + 56]

						movq [edx], mm0
						movq [edx + 8], mm4
						movq [edx + 16], mm2
						movq [edx + 24], mm6

						movq [edx + 32], mm1
						movq [edx + 40], mm5
						movq [edx + 48], mm3
						movq [edx + 56], mm7

						emms
						retn
					}
					default {
						if( thismethod.CallingConvention == CallingConvention.FastCallX86 ) {
							// ecx - this
							// edx - result
						}
						else {
							mov ecx, [esp + 4] // this
							mov edx, [esp + 8] // result
						}

						fld dword ptr [ecx + 4]
						fld dword ptr [ecx + 8]
						fld dword ptr [ecx + 12]

						fld dword ptr [ecx + 16]
						fld dword ptr [ecx + 32]
						fld dword ptr [ecx + 48]

						fstp dword ptr [edx + 12]
						fstp dword ptr [edx + 8]
						fstp dword ptr [edx + 4]

						fstp dword ptr [edx + 48]
						fstp dword ptr [edx + 32]
						fstp dword ptr [edx + 16]

						fld dword ptr [ecx + 24]
						fld dword ptr [ecx + 28]
						fld dword ptr [ecx + 44]

						fld dword ptr [ecx + 36]
						fld dword ptr [ecx + 52]
						fld dword ptr [ecx + 56]

						fstp dword ptr [edx + 44]
						fstp dword ptr [edx + 28]
						fstp dword ptr [edx + 24]

						fstp dword ptr [edx + 56]
						fstp dword ptr [edx + 52]
						fstp dword ptr [edx + 36]

						cmp ecx, edx
						jz ResultAndSourceSame

						fld dword ptr [ecx]
						fld dword ptr [ecx + 20]
						fld dword ptr [ecx + 40]
						fld dword ptr [ecx + 60]

						fstp dword ptr [edx + 60]
						fstp dword ptr [edx + 40]
						fstp dword ptr [edx + 20]
						fstp dword ptr [edx]

					ResultAndSourceSame:
						retn
					}
				}
				default {
					Matrix4x4 result;

					[Unroll]
					for( var i = 0; i < 4; ++i )
						[Unroll]
						for( var j = 0; j < 4; ++j )
							result.Value[i, j] = Value[j, i];

					return result;
				}
			}
		}

		public Matrix4x4 Inversed {
			[CallingConvention( Value = DefaultCallingConvention )]
			get asm {
				X86_32 {
					feature( SSE1 ) {
						if( thismethod.CallingConvention == CallingConvention.FastCallX86 ) {
							// ecx - this
							// edx - result
						}
						else {
							mov ecx, [esp + 4] // this
							mov edx, [esp + 8] // result
						}

						mov eax, esp
						and esp, -16
						sub esp, 0xB0

						movlps xmm2, [ecx + 8]
						movlps xmm4, [ecx + 40]
						movhps xmm2, [ecx + 24]
						movhps xmm4, [ecx + 56]
						movlps xmm3, [ecx + 32]
						movlps xmm1, [ecx]
						movhps xmm3, [ecx + 48]
						movhps xmm1, [ecx + 16]
						movaps xmm5, xmm2
						shufps xmm5, xmm4, 0b10001000
						shufps xmm4, xmm2, 0b11011101
						movaps xmm2, xmm4
						mulps xmm2, xmm5
						shufps xmm2, xmm2, 0b10110001
						movaps xmm6, xmm2
						shufps xmm6, xmm6, 0b01001110
						movaps xmm7, xmm3
						shufps xmm3, xmm1, 0b11011101
						shufps xmm1, xmm7, 0b10001000
						movaps xmm7, xmm3
						mulps xmm3, xmm6
						mulps xmm6, xmm1
						movaps xmm0, xmm6
						movaps xmm6, xmm7
						mulps xmm7, xmm2
						mulps xmm2, xmm1
						subps xmm3, xmm7
						movaps xmm7, xmm6
						mulps xmm7, xmm5
						shufps xmm5, xmm5, 0b01001110
						shufps xmm7, xmm7, 0b10110001
						movaps [esp + 16], xmm2
						movaps xmm2, xmm4
						mulps xmm2, xmm7
						addps xmm2, xmm3
						movaps xmm3, xmm7
						shufps xmm7, xmm7, 0b01001110
						mulps xmm3, xmm1
						movaps [esp + 32], xmm3
						movaps xmm3, xmm4
						mulps xmm3, xmm7
						mulps xmm7, xmm1
						subps xmm2, xmm3
						movaps xmm3, xmm6
						shufps xmm3, xmm3, 0b01001110
						mulps xmm3, xmm4
						shufps xmm3, xmm3, 0b10110001
						movaps [esp + 48], xmm7
						movaps xmm7, xmm5
						mulps xmm5, xmm3
						addps xmm5, xmm2
						movaps xmm2, xmm3
						shufps xmm3, xmm3, 0b01001110
						mulps xmm2, xmm1
						movaps [esp + 64], xmm4
						movaps xmm4, xmm7
						mulps xmm7, xmm3
						mulps xmm3, xmm1
						subps xmm5, xmm7
						subps xmm3, xmm2
						movaps xmm2, xmm1
						mulps xmm1, xmm5
						shufps xmm3, xmm3, 0b01001110
						movaps xmm7, xmm1
						shufps xmm1, xmm1, 0b01001110
						movaps [esp], xmm5
						addps xmm1, xmm7
						movaps xmm5, xmm1
						shufps xmm1, xmm1, 0b10110001
						addss xmm1, xmm5
						movaps xmm5, xmm6
						mulps xmm5, xmm2
						shufps xmm5, xmm5, 0b10110001
						movaps xmm7, xmm5
						shufps xmm5, xmm5, 0b01001110
						movaps [esp + 80], xmm4
						movaps xmm4, [esp + 64]
						movaps [esp + 64], xmm6
						movaps xmm6, xmm4
						mulps xmm6, xmm7
						addps xmm6, xmm3
						movaps xmm3, xmm4
						mulps xmm3, xmm5
						subps xmm3, xmm6
						movaps xmm6, xmm4
						mulps xmm6, xmm2
						shufps xmm6, xmm6, 0b10110001
						movaps [esp + 112], xmm5
						movaps xmm5, [esp + 64]
						movaps [esp + 128], xmm7
						movaps xmm7, xmm6
						mulps xmm7, xmm5
						addps xmm7, xmm3
						movaps xmm3, xmm6
						shufps xmm3, xmm3, 0b01001110
						movaps [esp + 144], xmm4
						movaps xmm4, xmm5
						mulps xmm5, xmm3
						movaps [esp + 160], xmm4
						movaps xmm4, xmm6
						movaps xmm6, xmm7
						subps xmm6, xmm5
						movaps xmm5, xmm0
						movaps xmm7, [esp + 16]
						subps xmm5, xmm7
						shufps xmm5, xmm5, 0b01001110
						movaps xmm7, [esp + 80]
						mulps xmm4, xmm7
						mulps xmm3, xmm7
						subps xmm5, xmm4
						mulps xmm2, xmm7
						addps xmm3, xmm5
						shufps xmm2, xmm2, 0b10110001
						movaps xmm4, xmm2
						shufps xmm4, xmm4, 0b01001110
						movaps xmm5, [esp + 144]
						movaps xmm0, xmm6
						movaps xmm6, xmm5
						mulps xmm5, xmm2
						mulps xmm6, xmm4
						addps xmm5, xmm3
						movaps xmm3, xmm4
						movaps xmm4, xmm5
						subps xmm4, xmm6
						movaps xmm5, [esp + 48]
						movaps xmm6, [esp + 32]
						subps xmm5, xmm6
						shufps xmm5, xmm5, 0b01001110
						movaps xmm6, [esp + 128]
						mulps xmm6, xmm7
						subps xmm6, xmm5
						movaps xmm5, [esp + 112]
						mulps xmm7, xmm5
						subps xmm6, xmm7
						movaps xmm5, [esp + 160]
						mulps xmm2, xmm5
						mulps xmm5, xmm3
						subps xmm6, xmm2
						movaps xmm2, xmm5
						addps xmm2, xmm6
						movaps xmm6, xmm0
						movaps xmm0, xmm1
						movaps xmm1, [esp]
						movaps xmm3, xmm0

						rcpss xmm5, xmm0
						mulss xmm0, xmm5
						mulss xmm0, xmm5
						addss xmm5, xmm5
						subss xmm5, xmm0
						movaps xmm0, xmm5

						addss xmm5, xmm5
						mulss xmm0, xmm0
						mulss xmm3, xmm0
						subss xmm5, xmm3
						shufps xmm5, xmm5, 0
						mulps xmm1, xmm5
						mulps xmm4, xmm5
						mulps xmm6, xmm5
						mulps xmm5, xmm2

						movaps [edx], xmm1
						movaps [edx + 16], xmm4
						movaps [edx + 32], xmm6
						movaps [edx + 48], xmm5

						mov esp, eax
						retn
					}
				}
				default {
					Matrix4x4 result;
					float[12] pairs;

					pairs[0] = Value[2, 2] * Value[3, 3];
					pairs[1] = Value[3, 2] * Value[2, 3];
					pairs[2] = Value[1, 2] * Value[3, 3];
					pairs[3] = Value[3, 2] * Value[1, 3];
					pairs[4] = Value[1, 2] * Value[2, 3];
					pairs[5] = Value[2, 2] * Value[1, 3];
					pairs[6] = Value[0, 2] * Value[3, 3];
					pairs[7] = Value[3, 2] * Value[0, 3];
					pairs[8] = Value[0, 2] * Value[2, 3];
					pairs[9] = Value[2, 2] * Value[0, 3];
					pairs[10] = Value[0, 2] * Value[1, 3];
					pairs[11] = Value[1, 2] * Value[0, 3];

					result.Value[0, 0] = ( pairs[0] * Value[1, 1] + pairs[3] * Value[2, 1] + pairs[4] * Value[3, 1] ) - ( pairs[1] * Value[1, 1] + pairs[2] * Value[2, 1] + pairs[5] * Value[3, 1] );
					result.Value[0, 1] = ( pairs[1] * Value[0, 1] + pairs[6] * Value[2, 1] + pairs[9] * Value[3, 1] ) - ( pairs[0] * Value[0, 1] + pairs[7] * Value[2, 1] + pairs[8] * Value[3, 1] );
					result.Value[0, 2] = ( pairs[2] * Value[0, 1] + pairs[7] * Value[1, 1] + pairs[10] * Value[3, 1] ) - ( pairs[3] * Value[0, 1] + pairs[6] * Value[1, 1] + pairs[11] * Value[3, 1] );
					result.Value[0, 3] = ( pairs[5] * Value[0, 1] + pairs[8] * Value[1, 1] + pairs[11] * Value[2, 1] ) - ( pairs[4] * Value[0, 1] + pairs[9] * Value[1, 1] + pairs[10] * Value[2, 1] );
					result.Value[1, 0] = ( pairs[1] * Value[1, 0] + pairs[2] * Value[2, 0] + pairs[5] * Value[3, 0] ) - ( pairs[0] * Value[1, 0] + pairs[3] * Value[2, 0] + pairs[4] * Value[3, 0] );
					result.Value[1, 1] = ( pairs[0] * Value[0, 0] + pairs[7] * Value[2, 0] + pairs[8] * Value[3, 0] ) - ( pairs[1] * Value[0, 0] + pairs[6] * Value[2, 0] + pairs[9] * Value[3, 0] );
					result.Value[1, 2] = ( pairs[3] * Value[0, 0] + pairs[6] * Value[1, 0] + pairs[11] * Value[3, 0] ) - ( pairs[2] * Value[0, 0] + pairs[7] * Value[1, 0] + pairs[10] * Value[3, 0] );
					result.Value[1, 3] = ( pairs[4] * Value[0, 0] + pairs[9] * Value[1, 0] + pairs[10] * Value[2, 0] ) - ( pairs[5] * Value[0, 0] + pairs[8] * Value[1, 0] + pairs[11] * Value[2, 0] );

					pairs[0] = Value[2, 0] * Value[3, 1];
					pairs[1] = Value[3, 0] * Value[2, 1];
					pairs[2] = Value[1, 0] * Value[3, 1];
					pairs[3] = Value[3, 0] * Value[1, 1];
					pairs[4] = Value[1, 0] * Value[2, 1];
					pairs[5] = Value[2, 0] * Value[1, 1];
					pairs[6] = Value[0, 0] * Value[3, 1];
					pairs[7] = Value[3, 0] * Value[0, 1];
					pairs[8] = Value[0, 0] * Value[2, 1];
					pairs[9] = Value[2, 0] * Value[0, 1];
					pairs[10] = Value[0, 0] * Value[1, 1];
					pairs[11] = Value[1, 0] * Value[0, 1];

					result.Value[2, 0] = ( pairs[0] * Value[1, 3] + pairs[3] * Value[2, 3] + pairs[4] * Value[3, 3] ) - ( pairs[1] * Value[1, 3] + pairs[2] * Value[2, 3] + pairs[5] * Value[3, 3] );
					result.Value[2, 1] = ( pairs[1] * Value[0, 3] + pairs[6] * Value[2, 3] + pairs[9] * Value[3, 3] ) - ( pairs[0] * Value[0, 3] + pairs[7] * Value[2, 3] + pairs[8] * Value[3, 3] );
					result.Value[2, 2] = ( pairs[2] * Value[0, 3] + pairs[7] * Value[1, 3] + pairs[10] * Value[3, 3] ) - ( pairs[3] * Value[0, 3] + pairs[6] * Value[1, 3] + pairs[11] * Value[3, 3] );
					result.Value[2, 3] = ( pairs[5] * Value[0, 3] + pairs[8] * Value[1, 3] + pairs[11] * Value[2, 3] ) - ( pairs[4] * Value[0, 3] + pairs[9] * Value[1, 3] + pairs[10] * Value[2, 3] );
					result.Value[3, 0] = ( pairs[2] * Value[2, 2] + pairs[5] * Value[3, 2] + pairs[1] * Value[1, 2] ) - ( pairs[4] * Value[3, 2] + pairs[0] * Value[1, 2] + pairs[3] * Value[2, 2] );
					result.Value[3, 1] = ( pairs[8] * Value[3, 2] + pairs[0] * Value[0, 2] + pairs[7] * Value[2, 2] ) - ( pairs[6] * Value[2, 2] + pairs[9] * Value[3, 2] + pairs[1] * Value[0, 2] );
					result.Value[3, 2] = ( pairs[6] * Value[1, 2] + pairs[11] * Value[3, 2] + pairs[3] * Value[0, 2] ) - ( pairs[10] * Value[3, 2] + pairs[2] * Value[0, 2] + pairs[7] * Value[1, 2] );
					result.Value[3, 3] = ( pairs[10] * Value[2, 2] + pairs[4] * Value[0, 2] + pairs[9] * Value[1, 2] ) - ( pairs[8] * Value[1, 2] + pairs[11] * Value[2, 2] + pairs[5] * Value[0, 2] );

					var determinant = 1f / ( Value[0, 0] * result.Value[0, 0] + Value[1, 0] * result.Value[0, 1] + Value[2, 0] * result.Value[0, 2] + Value[3, 0] * result.Value[0, 3] );

					[Unroll]
					for( var i = 0; i < 4; ++i ) {
						result.Value[i, 0] *= determinant;
						result.Value[i, 1] *= determinant;
						result.Value[i, 2] *= determinant;
						result.Value[i, 3] *= determinant;
					}

					return result;
				}
			}
		}


		[ForceInline]
		public Matrix4x4& Translate( float x, float y, float z ) {
			Value[3, 0] += Value[0, 0] * x + Value[1, 0] * y + Value[2, 0] * z;
			Value[3, 1] += Value[0, 1] * x + Value[1, 1] * y + Value[2, 1] * z;
			Value[3, 2] += Value[0, 2] * x + Value[1, 2] * y + Value[2, 2] * z;
			Value[3, 3] += Value[0, 3] * x + Value[1, 3] * y + Value[2, 3] * z;

			return this;
		}

		[ForceInline]
		public Matrix4x4& Translate( Vector3& vector ) {
			return Translate( vector.X, vector.Y, vector.Z );
		}

		public static bool operator ==( Matrix4x4& left, Matrix4x4& right ) asm {
			X86_32 {
				feature( SSE1 ) {
					if( thismethod.CallingConvention == CallingConvention.FastCallX86 ) {
						// ecx - left
						// edx - right
					}
					else {
						mov ecx, [esp + 4] // left
						mov edx, [esp + 8] // right
					}

					movaps xmm0, [ecx]
					movaps xmm1, [ecx + 16]
					movaps xmm2, [ecx + 32]
					movaps xmm3, [ecx + 48]

					cmpeqps xmm0, [edx]
					cmpeqps xmm1, [edx + 16]
					cmpeqps xmm2, [edx + 32]
					cmpeqps xmm3, [edx + 48]

					andps xmm0, xmm1
					andps xmm2, xmm3
					andps xmm0, xmm2

					movmskps eax, xmm0
					cmp eax, 15
					setz al

					retn
				}
				feature( AMD3DNow ) {
					if( thismethod.CallingConvention == CallingConvention.FastCallX86 ) {
						// ecx - left
						// edx - right
					}
					else {
						mov ecx, [esp + 4] // left
						mov edx, [esp + 8] // right
					}

					movq mm0, [ecx]
					movq mm1, [ecx + 8]
					movq mm2, [ecx + 16]
					movq mm3, [ecx + 24]
					movq mm4, [ecx + 32]
					movq mm5, [ecx + 40]
					movq mm6, [ecx + 48]
					movq mm7, [ecx + 56]

					pfcmpeq mm0, [edx]
					pfcmpeq mm1, [edx + 8]
					pfcmpeq mm2, [edx + 16]
					pfcmpeq mm3, [edx + 24]
					pfcmpeq mm4, [edx + 32]
					pfcmpeq mm5, [edx + 40]
					pfcmpeq mm6, [edx + 48]
					pfcmpeq mm7, [edx + 56]

					pand mm0, mm1
					pand mm2, mm3
					pand mm4, mm5
					pand mm6, mm7

					pand mm0, mm2
					pand mm4, mm6

					pand mm0, mm4

					packssdw mm0, mm0

					movd ecx, mm0

					femms

					xor eax, eax
					cmp ecx, -1
					setz al

					retn
				}
			}
			default {
				float* value1 = left.Value, value2 = right.Value;

				[Unroll]
				for( int i = 0; i < 16; ++i )
					if( value1[i] != value2[i] )
						return false;

				return true;
			}
		}

		private static uint _mnull = 0x80000001;
		private static int _minus1 = -1;

		[CallingConvention( Value = DefaultCallingConvention )]
		public bool Equals( Matrix4x4& other, int tolerance ) asm {
			X86_32 {
				feature( SSSE3 ) {
					if( thismethod.CallingConvention == CallingConvention.FastCallX86 ) {
						// ecx - this
						// edx - other
						movss xmm5, [esp + 4] // tolerance
					}
					else {
						mov ecx, [esp + 4]     // this
						mov edx, [esp + 8]     // other
						movss xmm5, [esp + 12] // tolerance
					}

					movss xmm6, [[_mnull]]
					movss xmm7, [[_minus1]]

					movaps xmm0, [ecx]
					movaps xmm1, [edx]

					pshufd xmm5, xmm5, 0b0000000
					pshufd xmm6, xmm6, 0b0000000
					pshufd xmm7, xmm7, 0b0000000

					movaps xmm2, xmm0
					movaps xmm3, xmm1
					movaps xmm4, xmm6

					pcmpgtd xmm2, xmm7
					pcmpgtd xmm3, xmm7

					pxor xmm2, xmm3
					pand xmm4, xmm2
					pxor xmm0, xmm2
					paddd xmm0, xmm4

					psubd xmm0, xmm1
					pabsd xmm0, xmm0
					pcmpgtd xmm0, xmm5

					// movmskps performs better
					// ptest xmm0, xmm0
					// jnz not_equal

					movmskps eax, xmm0
					test eax, eax
					jnz NotEqual

					//

					movaps xmm0, [ecx + 16]
					movaps xmm1, [edx + 16]

					movaps xmm2, xmm0
					movaps xmm3, xmm1
					movaps xmm4, xmm6

					pcmpgtd xmm2, xmm7
					pcmpgtd xmm3, xmm7

					pxor xmm2, xmm3
					pand xmm4, xmm2
					pxor xmm0, xmm2
					paddd xmm0, xmm4

					psubd xmm0, xmm1
					pabsd xmm0, xmm0
					pcmpgtd xmm0, xmm5

					movmskps eax, xmm0
					test eax, eax
					jnz NotEqual

					//

					movaps xmm0, [ecx + 32]
					movaps xmm1, [edx + 32]

					movaps xmm2, xmm0
					movaps xmm3, xmm1
					movaps xmm4, xmm6

					pcmpgtd xmm2, xmm7
					pcmpgtd xmm3, xmm7

					pxor xmm2, xmm3
					pand xmm4, xmm2
					pxor xmm0, xmm2
					paddd xmm0, xmm4

					psubd xmm0, xmm1
					pabsd xmm0, xmm0
					pcmpgtd xmm0, xmm5

					movmskps eax, xmm0
					test eax, eax
					jnz NotEqual

					//

					movaps xmm0, [ecx + 48]
					movaps xmm1, [edx + 48]

					movaps xmm2, xmm0
					movaps xmm3, xmm1
					movaps xmm4, xmm6

					pcmpgtd xmm2, xmm7
					pcmpgtd xmm3, xmm7

					pxor xmm2, xmm3
					pand xmm4, xmm2
					pxor xmm0, xmm2
					paddd xmm0, xmm4

					psubd xmm0, xmm1
					pabsd xmm0, xmm0
					pcmpgtd xmm0, xmm5

					movmskps eax, xmm0
					test eax, eax
					jnz NotEqual

					setz al
					retn

				NotEqual:
					xor eax, eax
					retn
				}
			}
			default {
				float* value1 = Value, value2 = other.Value;

				[Unroll]
				for( int i = 0; i < 16; ++i )
					if( !float.AreEqual( value1[i], value2[i], tolerance ) )
						return false;

				return true;
			}
		}

		public static bool operator !=( Matrix4x4& left, Matrix4x4& right ) {
			return !( left == right );
		}

		public float Determinant {
			[CallingConvention( Value = DefaultCallingConvention )]
			get asm {
				X86_32 {
					feature( SSE1 ) {
						if( thismethod.CallingConvention == CallingConvention.FastCallX86 ) {
							// ecx - this
						}
						else {
							mov ecx, [esp + 4] // this
						}

						movaps xmm0, [ecx + 32]
						movaps xmm1, [ecx + 48]
						movaps xmm2, [ecx + 16]

						movaps xmm3, xmm0
						movaps xmm4, xmm0
						movaps xmm6, xmm1
						movaps xmm7, xmm2

						shufps xmm0, xmm0, 0b00011011
						shufps xmm1, xmm1, 0b10110001
						shufps xmm2, xmm2, 0b01001110
						shufps xmm7, xmm7, 0b00111001
						mulps xmm0, xmm1
						shufps xmm3, xmm3, 0b01111101
						shufps xmm6, xmm6, 0b00001010
						movaps xmm5, xmm0
						shufps xmm0, xmm0, 0b01001110
						shufps xmm4, xmm4, 0b00001010
						shufps xmm1, xmm1, 0b00101000
						subps xmm5, xmm0
						mulps xmm3, xmm6
						mulps xmm4, xmm1
						mulps xmm5, xmm2
						shufps xmm2, xmm2, 0b00111001
						subps xmm3, xmm4
						movaps xmm0, xmm3
						shufps xmm0, xmm0, 0b00111001
						mulps xmm3, xmm2
						mulps xmm0, xmm7
						addps xmm5, xmm3
						subps xmm5, xmm0
						mulps xmm5, [ecx]
						movhlps xmm7, xmm5
						addps xmm5, xmm7
						movaps xmm6, xmm5
						shufps xmm6, xmm6, 0b00000001
						addss xmm5, xmm6

						mov eax, [esp]
						movss [esp], xmm5
						fld dword ptr [esp]
						pop ecx
						jmp eax
					}
					feature( AMD3DNow ) {
						if( thismethod.CallingConvention == CallingConvention.FastCallX86 ) {
							// ecx - this
						}
						else {
							mov ecx, [esp + 4] // this
						}

						movq mm2, qword ptr [ecx + 0x20]
						movq mm1, qword ptr [ecx + 0x28]
						movq mm0, qword ptr [ecx + 0x24]

						punpckhdq mm1, mm1
						punpckldq mm2, mm2

						movq mm5, qword ptr [ecx + 0x30]
						movq mm4, qword ptr [ecx + 0x38]
						movq mm3, qword ptr [ecx + 0x34]

						punpckhdq mm4, mm4
						punpckldq mm5, mm5

						pfmul mm4, mm0
						pfmul mm1, mm3
						pfsub mm4, mm1
						pfmul mm2, qword ptr [ecx + 0x38]
						pfmul mm5, qword ptr [ecx + 0x28]
						pfsub mm2, mm5
						pfmul mm3, qword ptr [ecx + 0x20]
						pfmul mm0, qword ptr [ecx + 0x30]
						pfsub mm3, mm0
						movq mm0, mm2
						punpckhdq mm0, mm0
						punpckldq mm0, mm4
						movq mm1, mm3
						punpckhdq mm1, mm4
						movq mm5, mm3
						punpckldq mm5, mm2
						movq mm6, qword ptr [ecx + 0x18]
						movq mm7, mm6
						punpckhdq mm6, mm6
						pfmul mm4, qword ptr [ecx + 0x10]
						pfmul mm3, mm6
						pfadd mm4, mm3
						movq mm3, qword ptr [ecx + 0x14]
						pfmul mm0, mm3
						pfsub mm4, mm0
						movq mm0, qword ptr [ecx + 0x10]
						punpckldq mm0, mm0
						movq mm6, qword ptr [ecx + 8]
						punpckldq mm6, dword ptr [ecx]
						pfmul mm4, mm6
						movq mm6, qword ptr [ecx + 8]
						punpckhdq mm6, qword ptr [ecx]
						pfmul mm5, mm7
						pfmul mm2, mm3
						pfmul mm1, mm0
						pfadd mm1, mm5
						pfsub mm1, mm2
						pfmul mm1, mm6
						pfsub mm4, mm1
						pfacc mm4, mm4

						mov eax, [esp]
						movd dword ptr [esp], mm4
						femms

						fld dword ptr [esp]
						pop ecx

						jmp eax
					}
				}
				default {
					float[12] pairs;

					pairs[0] = Value[2, 2] * Value[3, 3];
					pairs[1] = Value[3, 2] * Value[2, 3];
					pairs[2] = Value[1, 2] * Value[3, 3];
					pairs[3] = Value[3, 2] * Value[1, 3];
					pairs[4] = Value[1, 2] * Value[2, 3];
					pairs[5] = Value[2, 2] * Value[1, 3];
					pairs[6] = Value[0, 2] * Value[3, 3];
					pairs[7] = Value[3, 2] * Value[0, 3];
					pairs[8] = Value[0, 2] * Value[2, 3];
					pairs[9] = Value[2, 2] * Value[0, 3];
					pairs[10] = Value[0, 2] * Value[1, 3];
					pairs[11] = Value[1, 2] * Value[0, 3];

					var x = pairs[0] * Value[1, 1] + pairs[3] * Value[2, 1] + pairs[4] * Value[3, 1] - pairs[1] * Value[1, 1] - pairs[2] * Value[2, 1] - pairs[5] * Value[3, 1];
					var y = pairs[1] * Value[0, 1] + pairs[6] * Value[2, 1] + pairs[9] * Value[3, 1] - pairs[0] * Value[0, 1] - pairs[7] * Value[2, 1] - pairs[8] * Value[3, 1];
					var z = pairs[2] * Value[0, 1] + pairs[7] * Value[1, 1] + pairs[10] * Value[3, 1] - pairs[3] * Value[0, 1] - pairs[6] * Value[2, 1] - pairs[11] * Value[3, 1];
					var w = pairs[5] * Value[0, 1] + pairs[8] * Value[1, 1] + pairs[11] * Value[2, 1] - pairs[4] * Value[0, 1] - pairs[9] * Value[1, 1] - pairs[10] * Value[2, 1];

					return Value[0, 0] * x + Value[1, 0] * y + Value[2, 0] * z + Value[3, 0] * w;
				}
			}
		}

		public Matrix4x4& Scale( float x, float y, float z ) {
			Value[0, 0] *= x;
			Value[0, 1] *= x;
			Value[0, 2] *= x;
			Value[0, 3] *= x;

			Value[1, 0] *= y;
			Value[1, 1] *= y;
			Value[1, 2] *= y;
			Value[1, 3] *= y;

			Value[2, 0] *= z;
			Value[2, 1] *= z;
			Value[2, 2] *= z;
			Value[2, 3] *= z;

			return this;
		}

		public Matrix4x4& RotateX( float angle ) {
			float sin = Math.Sin( angle ), cos = Math.Cos( angle );

			float[8] temp;
			Memory.Copy( temp, &Value[1, 0], sizeof( temp ) );

			Value[1, 0] = temp[0] * cos + temp[4] * sin;
			Value[1, 1] = temp[1] * cos + temp[5] * sin;
			Value[1, 2] = temp[2] * cos + temp[6] * sin;
			Value[1, 3] = temp[3] * cos + temp[7] * sin;

			Value[2, 0] = temp[4] * cos - temp[0] * sin;
			Value[2, 1] = temp[5] * cos - temp[1] * sin;
			Value[2, 2] = temp[6] * cos - temp[2] * sin;
			Value[2, 3] = temp[7] * cos - temp[3] * sin;

			return this;
		}

		public Matrix4x4& RotateY( float angle ) {
			float sin = Math.Sin( angle ), cos = Math.Cos( angle );

			float[4] temp1, temp2;
			Memory.Copy( temp1, &Value[0, 0], sizeof( temp1 ) );
			Memory.Copy( temp2, &Value[2, 0], sizeof( temp2 ) );

			Value[0, 0] = temp1[0] * cos - temp2[0] * sin;
			Value[0, 1] = temp1[1] * cos - temp2[1] * sin;
			Value[0, 2] = temp1[2] * cos - temp2[2] * sin;
			Value[0, 3] = temp1[3] * cos - temp2[3] * sin;

			Value[2, 0] = temp1[0] * sin + temp2[0] * cos;
			Value[2, 1] = temp1[1] * sin + temp2[1] * cos;
			Value[2, 2] = temp1[2] * sin + temp2[2] * cos;
			Value[2, 3] = temp1[3] * sin + temp2[3] * cos;

			return this;
		}

		public Matrix4x4& RotateZ( float angle ) {
			float sin = Math.Sin( angle ), cos = Math.Cos( angle );

			float[8] temp;
			Memory.Copy( temp, Value, sizeof( temp ) );

			Value[0, 0] = temp[0] * cos + temp[4] * sin;
			Value[0, 1] = temp[1] * cos + temp[5] * sin;
			Value[0, 2] = temp[2] * cos + temp[6] * sin;
			Value[0, 3] = temp[3] * cos + temp[7] * sin;

			Value[1, 0] = temp[4] * cos - temp[0] * sin;
			Value[1, 1] = temp[5] * cos - temp[1] * sin;
			Value[1, 2] = temp[6] * cos - temp[2] * sin;
			Value[1, 3] = temp[7] * cos - temp[3] * sin;

			return this;
		}

		public Matrix4x4& Rotate( float x, float y, float z ) {
			float A = Math.Cos( x ), B = Math.Sin( x ), C = Math.Cos( y ), D = Math.Sin( y ), E = Math.Cos( z ), F = Math.Sin( z );

			float AC = A * C, AE = A * E, AF = A * F;
			float BC = B * C, BE = B * E, BF = B * F;
			float CE = C * E, CF = C * F;
			float DE = D * E, DF = D * F;

			float ADE = A * DE, ADF = A * DF;
			float BDE = B * DE, BDF = B * DF;

			float BF_ADE = BF - ADE;
			float BE_ADF = BE + ADF;
			float AF_BDE = AF + BDE;
			float AE_BDF = AE - BDF;

			float[12] temp;
			temp[0] = CE * Value[0, 0] + AF_BDE * Value[1, 0] + BF_ADE * Value[2, 0];
			temp[1] = CE * Value[0, 1] + AF_BDE * Value[1, 1] + BF_ADE * Value[2, 1];
			temp[2] = CE * Value[0, 2] + AF_BDE * Value[1, 2] + BF_ADE * Value[2, 2];
			temp[3] = CE * Value[0, 3] + AF_BDE * Value[1, 3] + BF_ADE * Value[2, 3];

			temp[4] = -CF * Value[0, 0] + AE_BDF * Value[1, 0] + BE_ADF * Value[2, 0];
			temp[5] = -CF * Value[0, 1] + AE_BDF * Value[1, 1] + BE_ADF * Value[2, 1];
			temp[6] = -CF * Value[0, 2] + AE_BDF * Value[1, 2] + BE_ADF * Value[2, 2];
			temp[7] = -CF * Value[0, 3] + AE_BDF * Value[1, 3] + BE_ADF * Value[2, 3];

			temp[8] = D * Value[0, 0] - BC * Value[1, 0] + AC * Value[2, 0];
			temp[9] = D * Value[0, 1] - BC * Value[1, 1] + AC * Value[2, 1];
			temp[10] = D * Value[0, 2] - BC * Value[1, 2] + AC * Value[2, 2];
			temp[11] = D * Value[0, 3] - BC * Value[1, 3] + AC * Value[2, 3];

			Memory.Copy( Value, temp, sizeof( temp ) );
			return this;
		}

		[CallingConvention( Value = DefaultCallingConvention )]
		public static Matrix4x4 operator *( Matrix4x4& leftMatrix, Matrix4x4& rightMatrix ) asm {
			X86_32 {
				feature( SSE2 ) {
					if( thismethod.CallingConvention == CallingConvention.FastCallX86 ) {
						// ecx - leftMatrix
						// edx - rightMatrix
						mov eax, [esp + 4] // result
					}
					else {
						mov ecx, [esp + 4]  // leftMatrix
						mov edx, [esp + 8]  // rightMatrix
						mov eax, [esp + 12] // result
					}

					prefetchnta [ecx]

					movaps xmm7, [edx]

					movaps xmm0, [ecx]
					movaps xmm1, [ecx + 16]
					movaps xmm2, [ecx + 32]
					movaps xmm3, [ecx + 48]

					pshufd xmm4, xmm7, 0b00000000
					pshufd xmm5, xmm7, 0b01010101
					pshufd xmm6, xmm7, 0b10101010
					pshufd xmm7, xmm7, 0b11111111

					mulps xmm4, xmm0
					mulps xmm5, xmm1
					mulps xmm6, xmm2
					mulps xmm7, xmm3

					addps xmm4, xmm5
					addps xmm6, xmm7
					addps xmm4, xmm6

					movaps xmm7, [edx + 16]
					movaps [eax], xmm4

					pshufd xmm4, xmm7, 0b00000000
					pshufd xmm5, xmm7, 0b01010101
					pshufd xmm6, xmm7, 0b10101010
					pshufd xmm7, xmm7, 0b11111111

					mulps xmm4, xmm0
					mulps xmm5, xmm1
					mulps xmm6, xmm2
					mulps xmm7, xmm3

					addps xmm4, xmm5
					addps xmm6, xmm7
					addps xmm4, xmm6

					movaps xmm7, [edx + 32]
					movaps [eax + 16], xmm4

					pshufd xmm4, xmm7, 0b00000000
					pshufd xmm5, xmm7, 0b01010101
					pshufd xmm6, xmm7, 0b10101010
					pshufd xmm7, xmm7, 0b11111111

					mulps xmm4, xmm0
					mulps xmm5, xmm1
					mulps xmm6, xmm2
					mulps xmm7, xmm3

					addps xmm4, xmm5
					addps xmm6, xmm7
					addps xmm4, xmm6

					movaps xmm7, [edx + 48]
					movaps [eax + 32], xmm4

					pshufd xmm4, xmm7, 0b00000000
					pshufd xmm5, xmm7, 0b01010101
					pshufd xmm6, xmm7, 0b10101010
					pshufd xmm7, xmm7, 0b11111111

					mulps xmm4, xmm0
					mulps xmm5, xmm1
					mulps xmm6, xmm2
					mulps xmm7, xmm3

					addps xmm4, xmm5
					addps xmm6, xmm7
					addps xmm4, xmm6

					movaps [eax + 48], xmm4

					retn
				}
				feature( SSE1 ) {
					if( thismethod.CallingConvention == CallingConvention.FastCallX86 ) {
						// ecx - leftMatrix
						// edx - rightMatrix
						mov eax, [esp + 4] // result
					}
					else {
						mov ecx, [esp + 4]  // leftMatrix
						mov edx, [esp + 8]  // rightMatrix
						mov eax, [esp + 12] // result
					}

					prefetchnta [ecx]

					movaps xmm0, [ecx]
					movaps xmm1, [ecx + 16]
					movaps xmm2, [ecx + 32]
					movaps xmm3, [ecx + 48]

					prefetchnta [edx]

					movaps xmm4, [edx]
					movaps xmm5, xmm4
					movaps xmm6, xmm4
					movaps xmm7, xmm4

					shufps xmm4, xmm4, 0b00000000
					shufps xmm5, xmm5, 0b01010101
					shufps xmm6, xmm6, 0b10101010
					shufps xmm7, xmm7, 0b11111111

					mulps xmm4, xmm0
					mulps xmm5, xmm1
					mulps xmm6, xmm2
					mulps xmm7, xmm3

					addps xmm4, xmm5
					addps xmm6, xmm7
					addps xmm4, xmm6

					movaps xmm5, [edx + 16]

					movaps [eax], xmm4

					movaps xmm4, xmm5
					movaps xmm6, xmm5
					movaps xmm7, xmm5

					shufps xmm4, xmm4, 0b00000000
					shufps xmm5, xmm5, 0b01010101
					shufps xmm6, xmm6, 0b10101010
					shufps xmm7, xmm7, 0b11111111

					mulps xmm4, xmm0
					mulps xmm5, xmm1
					mulps xmm6, xmm2
					mulps xmm7, xmm3

					addps xmm4, xmm5
					addps xmm6, xmm7
					addps xmm4, xmm6

					movaps xmm5, [edx + 32]

					movaps [eax + 16], xmm4

					movaps xmm4, xmm5
					movaps xmm6, xmm5
					movaps xmm7, xmm5

					shufps xmm4, xmm4, 0b00000000
					shufps xmm5, xmm5, 0b01010101
					shufps xmm6, xmm6, 0b10101010
					shufps xmm7, xmm7, 0b11111111

					mulps xmm4, xmm0
					mulps xmm5, xmm1
					mulps xmm6, xmm2
					mulps xmm7, xmm3

					addps xmm4, xmm5
					addps xmm6, xmm7
					addps xmm4, xmm6

					movaps xmm5, [edx + 48]

					movaps [eax + 32], xmm4

					movaps xmm4, xmm5
					movaps xmm6, xmm5
					movaps xmm7, xmm5

					shufps xmm4, xmm4, 0b00000000
					shufps xmm5, xmm5, 0b01010101
					shufps xmm6, xmm6, 0b10101010
					shufps xmm7, xmm7, 0b11111111

					mulps xmm4, xmm0
					mulps xmm5, xmm1
					mulps xmm6, xmm2
					mulps xmm7, xmm3

					addps xmm4, xmm5
					addps xmm6, xmm7
					addps xmm4, xmm6

					movaps [eax + 48], xmm4

					retn
				}
				feature( AMD3DNow ) {
					if( thismethod.CallingConvention == CallingConvention.FastCallX86 ) {
						// ecx - leftMatrix
						// edx - rightMatrix
						mov eax, [esp + 4] // result
					}
					else {
						mov ecx, [esp + 4]  // leftMatrix
						mov edx, [esp + 8]  // rightMatrix
						mov eax, [esp + 12] // result
					}

					cmp eax, edx
					jz RightAndResultSame

					push esp
				RightAndResultNotSame:

					movq mm0, [ecx]
					movq mm1, [ecx + 16]
					movq mm2, [ecx + 32]
					movq mm3, [ecx + 48]

					pshufw mm4, [edx], 0b01000100
					pshufw mm5, [edx], 0b11101110
					pshufw mm6, [edx + 8], 0b01000100
					pshufw mm7, [edx + 8], 0b11101110

					pfmul mm4, mm0
					pfmul mm5, mm1

					pfadd mm4, mm5
					pfmul mm6, mm2

					pfadd mm4, mm6
					pfmul mm7, mm3

					pfadd mm4, mm7

					movq [eax], mm4

					pshufw mm4, [edx + 16], 0b01000100
					pshufw mm5, [edx + 16], 0b11101110
					pshufw mm6, [edx + 24], 0b01000100
					pshufw mm7, [edx + 24], 0b11101110

					pfmul mm4, mm0
					pfmul mm5, mm1

					pfadd mm4, mm5
					pfmul mm6, mm2

					pfadd mm4, mm6
					pfmul mm7, mm3

					pfadd mm4, mm7

					movq [eax + 16], mm4

					pshufw mm4, [edx + 32], 0b01000100
					pshufw mm5, [edx + 32], 0b11101110
					pshufw mm6, [edx + 40], 0b01000100
					pshufw mm7, [edx + 40], 0b11101110

					pfmul mm4, mm0
					pfmul mm5, mm1

					pfadd mm4, mm5
					pfmul mm6, mm2

					pfadd mm4, mm6
					pfmul mm7, mm3

					pfadd mm4, mm7

					movq [eax + 32], mm4

					pshufw mm4, [edx + 48], 0b01000100
					pshufw mm5, [edx + 48], 0b11101110
					pshufw mm6, [edx + 56], 0b01000100
					pshufw mm7, [edx + 56], 0b11101110

					pfmul mm4, mm0
					pfmul mm5, mm1

					pfadd mm4, mm5
					pfmul mm6, mm2

					pfadd mm4, mm6
					pfmul mm7, mm3

					pfadd mm4, mm7

					movq [eax + 48], mm4

					movq mm0, [ecx + 8]
					movq mm1, [ecx + 24]
					movq mm2, [ecx + 40]
					movq mm3, [ecx + 56]

					pshufw mm4, [edx], 0b01000100
					pshufw mm5, [edx], 0b11101110
					pshufw mm6, [edx + 8], 0b01000100
					pshufw mm7, [edx + 8], 0b11101110

					pfmul mm4, mm0
					pfmul mm5, mm1

					pfadd mm4, mm5
					pfmul mm6, mm2

					pfadd mm4, mm6
					pfmul mm7, mm3

					pfadd mm4, mm7

					movq [eax + 8], mm4

					pshufw mm4, [edx + 16], 0b01000100
					pshufw mm5, [edx + 16], 0b11101110
					pshufw mm6, [edx + 24], 0b01000100
					pshufw mm7, [edx + 24], 0b11101110

					pfmul mm4, mm0
					pfmul mm5, mm1

					pfadd mm4, mm5
					pfmul mm6, mm2

					pfadd mm4, mm6
					pfmul mm7, mm3

					pfadd mm4, mm7

					movq [eax + 24], mm4

					pshufw mm4, [edx + 32], 0b01000100
					pshufw mm5, [edx + 32], 0b11101110
					pshufw mm6, [edx + 40], 0b01000100
					pshufw mm7, [edx + 40], 0b11101110

					pfmul mm4, mm0
					pfmul mm5, mm1

					pfadd mm4, mm5
					pfmul mm6, mm2

					pfadd mm4, mm6
					pfmul mm7, mm3

					pfadd mm4, mm7

					movq [eax + 40], mm4

					pshufw mm4, [edx + 48], 0b01000100
					pshufw mm5, [edx + 48], 0b11101110
					pshufw mm6, [edx + 56], 0b01000100
					pshufw mm7, [edx + 56], 0b11101110

					pfmul mm4, mm0
					pfmul mm5, mm1

					pfadd mm4, mm5
					pfmul mm6, mm2

					pfadd mm4, mm6
					pfmul mm7, mm3

					pfadd mm4, mm7

					movq [eax + 56], mm4

					femms

					pop esp
					retn

				RightAndResultSame:

					mov edx, esp
					and esp, 0xFFFFFFF0
					sub esp, 64
					push edx
					lea edx, [esp + 4]

					movq mm0, [eax]
					movq mm1, [eax + 8]
					movq mm2, [eax + 16]
					movq mm3, [eax + 24]
					movq mm4, [eax + 32]
					movq mm5, [eax + 40]
					movq mm6, [eax + 48]
					movq mm6, [eax + 56]

					movq [edx], mm0
					movq [edx + 8], mm1
					movq [edx + 16], mm2
					movq [edx + 24], mm3
					movq [edx + 32], mm4
					movq [edx + 40], mm5
					movq [edx + 48], mm6
					movq [edx + 56], mm7

					jmp RightAndResultNotSame
				}
			}
			default {
				Matrix4x4 resultMatrix;

				var& left = leftMatrix.Value;
				var& right = rightMatrix.Value;
				var& result = resultMatrix.Value;

				result[0, 0] = left[0, 0] * right[0, 0] + left[1, 0] * right[0, 1] + left[2, 0] * right[0, 2] + left[3, 0] * right[0, 3];
				result[0, 1] = left[0, 1] * right[0, 0] + left[1, 1] * right[0, 1] + left[2, 1] * right[0, 2] + left[3, 1] * right[0, 3];
				result[0, 2] = left[0, 2] * right[0, 0] + left[1, 2] * right[0, 1] + left[2, 2] * right[0, 2] + left[3, 2] * right[0, 3];
				result[0, 3] = left[0, 3] * right[0, 0] + left[1, 3] * right[0, 1] + left[2, 3] * right[0, 2] + left[3, 3] * right[0, 3];
				result[1, 0] = left[0, 0] * right[1, 0] + left[1, 0] * right[1, 1] + left[2, 0] * right[1, 2] + left[3, 0] * right[1, 3];
				result[1, 1] = left[0, 1] * right[1, 0] + left[1, 1] * right[1, 1] + left[2, 1] * right[1, 2] + left[3, 1] * right[1, 3];
				result[1, 2] = left[0, 2] * right[1, 0] + left[1, 2] * right[1, 1] + left[2, 2] * right[1, 2] + left[3, 2] * right[1, 3];
				result[1, 3] = left[0, 3] * right[1, 0] + left[1, 3] * right[1, 1] + left[2, 3] * right[1, 2] + left[3, 3] * right[1, 3];
				result[2, 0] = left[0, 0] * right[2, 0] + left[1, 0] * right[2, 1] + left[2, 0] * right[2, 2] + left[3, 0] * right[2, 3];
				result[2, 1] = left[0, 1] * right[2, 0] + left[1, 1] * right[2, 1] + left[2, 1] * right[2, 2] + left[3, 1] * right[2, 3];
				result[2, 2] = left[0, 2] * right[2, 0] + left[1, 2] * right[2, 1] + left[2, 2] * right[2, 2] + left[3, 2] * right[2, 3];
				result[2, 3] = left[0, 3] * right[2, 0] + left[1, 3] * right[2, 1] + left[2, 3] * right[2, 2] + left[3, 3] * right[2, 3];
				result[3, 0] = left[0, 0] * right[3, 0] + left[1, 0] * right[3, 1] + left[2, 0] * right[3, 2] + left[3, 0] * right[3, 3];
				result[3, 1] = left[0, 1] * right[3, 0] + left[1, 1] * right[3, 1] + left[2, 1] * right[3, 2] + left[3, 1] * right[3, 3];
				result[3, 2] = left[0, 2] * right[3, 0] + left[1, 2] * right[3, 1] + left[2, 2] * right[3, 2] + left[3, 2] * right[3, 3];
				result[3, 3] = left[0, 3] * right[3, 0] + left[1, 3] * right[3, 1] + left[2, 3] * right[3, 2] + left[3, 3] * right[3, 3];

				return resultMatrix;
			}
		}

		[CallingConvention( Value = DefaultCallingConvention )]
		public static Matrix4x4 operator -( Matrix4x4& leftMatrix, Matrix4x4& rightMatrix ) asm {
			X86_32 {
				feature( SSE1 ) {
					if( thismethod.CallingConvention == CallingConvention.FastCallX86 ) {
						// ecx - leftMatrix
						// edx - rightMatrix
						mov eax, [esp + 4] // result
					}
					else {
						mov ecx, [esp + 4]  // leftMatrix
						mov edx, [esp + 8]  // rightMatrix
						mov eax, [esp + 12] // result
					}

					movaps xmm0, [ecx]
					movaps xmm1, [ecx + 16]
					movaps xmm2, [ecx + 32]
					movaps xmm3, [ecx + 48]

					subps xmm0, [edx]
					subps xmm1, [edx + 16]
					subps xmm2, [edx + 32]
					subps xmm3, [edx + 48]

					movaps [eax], xmm0
					movaps [eax + 16], xmm1
					movaps [eax + 32], xmm2
					movaps [eax + 48], xmm3

					retn
				}
				feature( AMD3DNow ) {
					if( thismethod.CallingConvention == CallingConvention.FastCallX86 ) {
						// ecx - leftMatrix
						// edx - rightMatrix
						mov eax, [esp + 4] // result
					}
					else {
						mov ecx, [esp + 4]  // leftMatrix
						mov edx, [esp + 8]  // rightMatrix
						mov eax, [esp + 12] // result
					}

					movq mm0, [ecx]
					movq mm1, [ecx + 8]
					movq mm2, [ecx + 16]
					movq mm3, [ecx + 24]
					movq mm4, [ecx + 32]
					movq mm5, [ecx + 40]
					movq mm6, [ecx + 48]
					movq mm7, [ecx + 56]

					pfsub mm0, [edx]
					pfsub mm1, [edx + 8]
					pfsub mm2, [edx + 16]
					pfsub mm3, [edx + 24]
					pfsub mm4, [edx + 32]
					pfsub mm5, [edx + 40]
					pfsub mm6, [edx + 48]
					pfsub mm7, [edx + 56]

					movq [eax], mm0
					movq [eax + 8], mm1
					movq [eax + 16], mm2
					movq [eax + 24], mm3
					movq [eax + 32], mm4
					movq [eax + 40], mm5
					movq [eax + 48], mm6
					movq [eax + 56], mm7

					femms
					retn
				}
				default {
					if( thismethod.CallingConvention == CallingConvention.FastCallX86 ) {
						// ecx - leftMatrix
						// edx - rightMatrix
						mov eax, [esp + 4] // result
					}
					else {
						mov ecx, [esp + 4]  // leftMatrix
						mov edx, [esp + 8]  // rightMatrix
						mov eax, [esp + 12] // result
					}

					fld dword ptr [ecx]
					fsub dword ptr [edx]

					fld dword ptr [ecx + 4]
					fsub dword ptr [edx + 4]

					fld dword ptr [ecx + 8]
					fsub dword ptr [edx + 8]

					fld dword ptr [ecx + 12]
					fsub dword ptr [edx + 12]

					fld dword ptr [ecx + 16]
					fsub dword ptr [edx + 16]

					fld dword ptr [ecx + 20]
					fsub dword ptr [edx + 20]

					fld dword ptr [ecx + 24]
					fsub dword ptr [edx + 24]

					fld dword ptr [ecx + 28]
					fsub dword ptr [edx + 28]

					fstp dword ptr [eax + 28]
					fstp dword ptr [eax + 24]
					fstp dword ptr [eax + 20]
					fstp dword ptr [eax + 16]
					fstp dword ptr [eax + 12]
					fstp dword ptr [eax + 8]
					fstp dword ptr [eax + 4]
					fstp dword ptr [eax]

					fld dword ptr [ecx + 32]
					fsub dword ptr [edx + 32]

					fld dword ptr [ecx + 36]
					fsub dword ptr [edx + 36]

					fld dword ptr [ecx + 40]
					fsub dword ptr [edx + 40]

					fld dword ptr [ecx + 44]
					fsub dword ptr [edx + 44]

					fld dword ptr [ecx + 48]
					fsub dword ptr [edx + 48]

					fld dword ptr [ecx + 52]
					fsub dword ptr [edx + 52]

					fld dword ptr [ecx + 56]
					fsub dword ptr [edx + 56]

					fld dword ptr [ecx + 60]
					fsub dword ptr [edx + 60]

					fstp dword ptr [eax + 60]
					fstp dword ptr [eax + 56]
					fstp dword ptr [eax + 52]
					fstp dword ptr [eax + 48]
					fstp dword ptr [eax + 44]
					fstp dword ptr [eax + 40]
					fstp dword ptr [eax + 36]
					fstp dword ptr [eax + 32]

					retn
				}
			}
			default {
				Matrix4x4 resultMatrix;

				var& left = leftMatrix.Value;
				var& right = rightMatrix.Value;
				var& result = resultMatrix.Value;

				for( int i = 0; i < 4; ++i )
					for( int j = 0; j < 4; ++j )
						result[i, j] = left[i, j] - right[i, j];

				return resultMatrix;
			}
		}

		[CallingConvention( Value = DefaultCallingConvention )]
		public static Matrix4x4 operator +( Matrix4x4& leftMatrix, Matrix4x4& rightMatrix ) asm {
			X86_32 {
				feature( SSE1 ) {
					if( thismethod.CallingConvention == CallingConvention.FastCallX86 ) {
						// ecx - leftMatrix
						// edx - rightMatrix
						mov eax, [esp + 4] // result
					}
					else {
						mov ecx, [esp + 4]  // leftMatrix
						mov edx, [esp + 8]  // rightMatrix
						mov eax, [esp + 12] // result
					}

					movaps xmm0, [ecx]
					movaps xmm1, [ecx + 16]
					movaps xmm2, [ecx + 32]
					movaps xmm3, [ecx + 48]

					addps xmm0, [edx]
					addps xmm1, [edx + 16]
					addps xmm2, [edx + 32]
					addps xmm3, [edx + 48]

					movaps [eax], xmm0
					movaps [eax + 16], xmm1
					movaps [eax + 32], xmm2
					movaps [eax + 48], xmm3

					retn
				}
				feature( AMD3DNow ) {
					if( thismethod.CallingConvention == CallingConvention.FastCallX86 ) {
						// ecx - leftMatrix
						// edx - rightMatrix
						mov eax, [esp + 4] // result
					}
					else {
						mov ecx, [esp + 4]  // leftMatrix
						mov edx, [esp + 8]  // rightMatrix
						mov eax, [esp + 12] // result
					}

					movq mm0, [ecx]
					movq mm1, [ecx + 8]
					movq mm2, [ecx + 16]
					movq mm3, [ecx + 24]
					movq mm4, [ecx + 32]
					movq mm5, [ecx + 40]
					movq mm6, [ecx + 48]
					movq mm7, [ecx + 56]

					pfadd mm0, [edx]
					pfadd mm1, [edx + 8]
					pfadd mm2, [edx + 16]
					pfadd mm3, [edx + 24]
					pfadd mm4, [edx + 32]
					pfadd mm5, [edx + 40]
					pfadd mm6, [edx + 48]
					pfadd mm7, [edx + 56]

					movq [eax], mm0
					movq [eax + 8], mm1
					movq [eax + 16], mm2
					movq [eax + 24], mm3
					movq [eax + 32], mm4
					movq [eax + 40], mm5
					movq [eax + 48], mm6
					movq [eax + 56], mm7

					femms
					retn
				}
				default {
					if( thismethod.CallingConvention == CallingConvention.FastCallX86 ) {
						// ecx - leftMatrix
						// edx - rightMatrix
						mov eax, [esp + 4] // result
					}
					else {
						mov ecx, [esp + 4]  // leftMatrix
						mov edx, [esp + 8]  // rightMatrix
						mov eax, [esp + 12] // result
					}

					fld dword ptr [ecx]
					fadd dword ptr [edx]

					fld dword ptr [ecx + 4]
					fadd dword ptr [edx + 4]

					fld dword ptr [ecx + 8]
					fadd dword ptr [edx + 8]

					fld dword ptr [ecx + 12]
					fadd dword ptr [edx + 12]

					fld dword ptr [ecx + 16]
					fadd dword ptr [edx + 16]

					fld dword ptr [ecx + 20]
					fadd dword ptr [edx + 20]

					fld dword ptr [ecx + 24]
					fadd dword ptr [edx + 24]

					fld dword ptr [ecx + 28]
					fadd dword ptr [edx + 28]

					fstp dword ptr [eax + 28]
					fstp dword ptr [eax + 24]
					fstp dword ptr [eax + 20]
					fstp dword ptr [eax + 16]
					fstp dword ptr [eax + 12]
					fstp dword ptr [eax + 8]
					fstp dword ptr [eax + 4]
					fstp dword ptr [eax]

					fld dword ptr [ecx + 32]
					fadd dword ptr [edx + 32]

					fld dword ptr [ecx + 36]
					fadd dword ptr [edx + 36]

					fld dword ptr [ecx + 40]
					fadd dword ptr [edx + 40]

					fld dword ptr [ecx + 44]
					fadd dword ptr [edx + 44]

					fld dword ptr [ecx + 48]
					fadd dword ptr [edx + 48]

					fld dword ptr [ecx + 52]
					fadd dword ptr [edx + 52]

					fld dword ptr [ecx + 56]
					fadd dword ptr [edx + 56]

					fld dword ptr [ecx + 60]
					fadd dword ptr [edx + 60]

					fstp dword ptr [eax + 60]
					fstp dword ptr [eax + 56]
					fstp dword ptr [eax + 52]
					fstp dword ptr [eax + 48]
					fstp dword ptr [eax + 44]
					fstp dword ptr [eax + 40]
					fstp dword ptr [eax + 36]
					fstp dword ptr [eax + 32]

					retn
				}
			}
			default {
				Matrix4x4 resultMatrix;

				var& left = leftMatrix.Value;
				var& right = rightMatrix.Value;
				var& result = resultMatrix.Value;

				for( int i = 0; i < 4; ++i )
					for( int j = 0; j < 4; ++j )
						result[i, j] = left[i, j] + right[i, j];

				return resultMatrix;
			}
		}

		[UnitTest]
		private static void UnitTest() {
			var mat1 = new Matrix4x4( new float[16] {
				1f, 2, 3, 0,
				4f, 5, 6, 0,
				0f, 8, 7, 0,
				2f, 3, 0, 1
			} );

			var equal1 = new Matrix4x4( new float[16] {
				bitcast<float>( bitcast<uint>( 1f ) + 1 ), 2, 3, 0,
				bitcast<float>( bitcast<uint>( 4f ) + 1 ), 5, 6, 0,
				bitcast<float>( bitcast<uint>( 0f ) + 1 ), 8, 7, 0,
				bitcast<float>( bitcast<uint>( 2f ) + 1 ), 3, 0, 1
			} );

			var mat2 = new Matrix4x4( new float[16] {
				-0.48148149f, 0.37037039f, -0.11111111f, 0f,
				  -1.037037f, 0.25925925f,  0.22222222f, 0f,
				  1.1851852f, -0.2962963f, -0.11111111f, 0f,
				  4.0740743f, -1.5185186f, -0.44444445f, 1f
			} );

			var mat3 = new Matrix4x4( new float[16] {
				1, 4, 0, 2,
				2, 5, 8, 3,
				3, 6, 7, 0,
				0, 0, 0, 1
			} );


			Assert.IsTrue( mat1.Equals( equal1, 2 ) );
			Assert.IsTrue( Identity.Determinant == 1f );
			Assert.IsTrue( mat1.Determinant == 27f );
			Assert.IsTrue( mat1.Inversed == mat2 );
			Assert.IsTrue( mat1.Transposed == mat3 );

			var ozdrmat = new Matrix4x4( new float[16] {
				-1,  0, 0, 0,
				 2, -1, 0, 0,
				 3,  2, 1, 0,
				 4,  3, 2, 1
			} );

			var ozdrVector = new Vector4( 2, 0, 1, 0 );
			Assert.IsTrue( new Vector4( 5, 2, 1, 0 ) == ozdrmat.Inversed * ozdrVector );
		}
	}
}