From 64924b954d94338f22d19f3ea1eef701aad64549 Mon Sep 17 00:00:00 2001
From: Luxko <liarluxlux@gmail.com>
Date: Sat, 25 Feb 2017 07:26:11 +0800
Subject: [PATCH 1/6] [WIP]Add basic SIMD support

- Add an opt-in SIMD support for the module. The feature requires crate
`simd` and specialization, thus can only be enabled under nightly. Under
the given benchmark certain operations were able to be up to 60% faster.
Currently the supported types as well as operations are highly limited.
- Clean up some deadly tests. Also add new tests for SIMD.
---
 Cargo.toml          |   2 +
 src/lib.rs          |   5 +-
 src/macros.rs       | 257 ++++++++++++++++++
 src/matrix.rs       | 203 ++++++++++++--
 src/vector.rs       | 628 +++++++++++++++++++++++++++++++++++++++++++-
 tests/quaternion.rs |   4 +-
 tests/vectorf32.rs  | 267 +++++++++++++++++++
 7 files changed, 1335 insertions(+), 31 deletions(-)
 create mode 100644 tests/vectorf32.rs

diff --git a/Cargo.toml b/Cargo.toml
index 12bd240..f470093 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -30,6 +30,7 @@ name = "cgmath"
 unstable = []
 default = ["rustc-serialize"]
 eders = ["serde", "serde_macros"]
+use_simd = ["simd"]
 
 [dependencies]
 approx = "0.1"
@@ -38,6 +39,7 @@ rand = "0.3"
 rustc-serialize = { version = "0.3", optional = true }
 serde = { version = "0.8", optional = true }
 serde_macros = { version = "0.8", optional = true }
+simd = { version = "0.2", optional = true }
 
 [dev-dependencies]
 glium = "0.15"
diff --git a/src/lib.rs b/src/lib.rs
index 822857b..8941603 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -49,9 +49,9 @@
 //! ```rust
 //! use cgmath::prelude::*;
 //! ```
-
 #![cfg_attr(feature = "eders", feature(plugin, custom_derive))]
 #![cfg_attr(feature = "eders", plugin(serde_macros))]
+#![cfg_attr(feature = "use_simd", feature(specialization))]
 
 #[macro_use]
 extern crate approx;
@@ -64,6 +64,9 @@ extern crate rustc_serialize;
 #[cfg(feature = "eders")]
 extern crate serde;
 
+#[cfg(feature = "use_simd")]
+extern crate simd;
+
 // Re-exports
 
 pub use approx::*;
diff --git a/src/macros.rs b/src/macros.rs
index d9e54de..ac741e8 100644
--- a/src/macros.rs
+++ b/src/macros.rs
@@ -254,3 +254,260 @@ macro_rules! impl_index_operators {
         }
     }
 }
+
+#[cfg(feature = "use_simd")]
+macro_rules! impl_operator_default {
+    // When it is an unary operator
+    (<$S:ident: $Constraint:ident> $Op:ident for $Lhs:ty {
+        fn $op:ident($x:ident) -> $Output:ty { $body:expr }
+    }) => {
+        impl<$S: $Constraint> $Op for $Lhs {
+           type Output = $Output;
+            #[inline]
+            default fn $op(self) -> $Output {
+                let $x = self; $body
+            }
+        }
+
+        impl<'a, $S: $Constraint> $Op for &'a $Lhs {
+           type Output = $Output;
+            #[inline]
+            default fn $op(self) -> $Output {
+                let $x = self; $body
+            }
+        }
+    };
+    // When the right operand is a scalar
+    (<$S:ident: $Constraint:ident> $Op:ident<$Rhs:ident> for $Lhs:ty {
+        fn $op:ident($lhs:ident, $rhs:ident) -> $Output:ty { $body:expr }
+    }) => {
+        impl<$S: $Constraint> $Op<$Rhs> for $Lhs {
+           type Output = $Output;
+            #[inline]
+            default fn $op(self, other: $Rhs) -> $Output {
+                let ($lhs, $rhs) = (self, other); $body
+            }
+        }
+
+        impl<'a, $S: $Constraint> $Op<$Rhs> for &'a $Lhs {
+          type Output = $Output;
+            #[inline]
+            default fn $op(self, other: $Rhs) -> $Output {
+                let ($lhs, $rhs) = (self, other); $body
+            }
+        }
+    };
+    // When the right operand is a compound type
+    (<$S:ident: $Constraint:ident> $Op:ident<$Rhs:ty> for $Lhs:ty {
+        fn $op:ident($lhs:ident, $rhs:ident) -> $Output:ty { $body:expr }
+    }) => {
+        impl<$S: $Constraint> $Op<$Rhs> for $Lhs {
+           type Output = $Output;
+            #[inline]
+            default fn $op(self, other: $Rhs) -> $Output {
+                let ($lhs, $rhs) = (self, other); $body
+            }
+        }
+
+        impl<'a, $S: $Constraint> $Op<&'a $Rhs> for $Lhs {
+           type Output = $Output;
+            #[inline]
+            default fn $op(self, other: &'a $Rhs) -> $Output {
+                let ($lhs, $rhs) = (self, other); $body
+            }
+        }
+
+        impl<'a, $S: $Constraint> $Op<$Rhs> for &'a $Lhs {
+           type Output = $Output;
+            #[inline]
+            default fn $op(self, other: $Rhs) -> $Output {
+                let ($lhs, $rhs) = (self, other); $body
+            }
+        }
+
+        impl<'a, 'b, $S: $Constraint> $Op<&'a $Rhs> for &'b $Lhs {
+           type Output = $Output;
+            #[inline]
+            default fn $op(self, other: &'a $Rhs) -> $Output {
+                let ($lhs, $rhs) = (self, other); $body
+            }
+        }
+    };
+    // When the left operand is a scalar
+    ($Op:ident<$Rhs:ident<$S:ident>> for $Lhs:ty {
+        fn $op:ident($lhs:ident, $rhs:ident) -> $Output:ty { $body:expr }
+    }) => {
+        impl $Op<$Rhs<$S>> for $Lhs {
+           type Output = $Output;
+            #[inline]
+            default fn $op(self, other: $Rhs<$S>) -> $Output {
+                let ($lhs, $rhs) = (self, other); $body
+            }
+        }
+
+        impl<'a> $Op<&'a $Rhs<$S>> for $Lhs {
+           type Output = $Output;
+            #[inline]
+            default fn $op(self, other: &'a $Rhs<$S>) -> $Output {
+                let ($lhs, $rhs) = (self, other); $body
+            }
+        }
+    };
+}
+
+#[cfg(feature = "use_simd")]
+macro_rules! impl_assignment_operator_default {
+    (<$S:ident: $Constraint:ident> $Op:ident<$Rhs:ty> for $Lhs:ty {
+        fn $op:ident(&mut $lhs:ident, $rhs:ident) $body:block
+    }) => {
+        impl<$S: $Constraint + $Op<$S>> $Op<$Rhs> for $Lhs {
+            #[inline]
+            default fn $op(&mut $lhs, $rhs: $Rhs) $body
+        }
+    };
+}
+
+/// Generates a binary operator implementation for the permutations of by-ref and by-val, for simd
+#[cfg(feature = "use_simd")]
+macro_rules! impl_operator_simd {
+    // When it is an unary operator
+    ([$Simd:ident]; $Op:ident for $Lhs:ty {
+        fn $op:ident($x:ident) -> $Output:ty { $body:expr }
+    }) => {
+ 
+        impl $Op for $Lhs {
+            #[inline]
+            fn $op(self) -> $Output {
+                let $x: $Simd = self.into(); $body
+            }
+        }
+
+        // #[cfg(feature = "simd")]
+        // impl<'a> $Op for &'a $Lhs {
+        //     type Output = $Output;
+        //     #[inline]
+        //     fn $op(self) -> $Output {
+        //         let $x: $Simd = (*self).into(); $body
+        //     }
+        // }
+    };
+    // When the right operand is a scalar
+    (@rs [$Simd:ident]; $Op:ident<$Rhs:ty> for $Lhs:ty {
+        fn $op:ident($lhs:ident, $rhs:ident) -> $Output:ty { $body:expr }
+    }) => {
+ 
+        impl $Op<$Rhs> for $Lhs {
+            #[inline]
+            fn $op(self, other: $Rhs) -> $Output {
+                let ($lhs, $rhs): ($Simd, $Simd) = (self.into(), $Simd::splat(other)); $body
+            }
+        }
+
+ 
+        impl<'a> $Op<$Rhs> for &'a $Lhs {
+            #[inline]
+            fn $op(self, other: $Rhs) -> $Output {
+                let ($lhs, $rhs): ($Simd, $Simd) = ((*self).into(), $Simd::splat(other)); $body
+            }
+        }
+    };
+    // When the right operand is a compound type
+    ([$Simd:ident]; $Op:ident<$Rhs:ty> for $Lhs:ty {
+        fn $op:ident($lhs:ident, $rhs:ident) -> $Output:ty { $body:expr }
+    }) => {
+ 
+        impl $Op<$Rhs> for $Lhs {
+            #[inline]
+            fn $op(self, other: $Rhs) -> $Output {
+                let ($lhs, $rhs): ($Simd, $Simd) = (self.into(), other.into()); $body
+            }
+        }
+
+ 
+        impl<'a> $Op<&'a $Rhs> for $Lhs {
+            #[inline]
+            fn $op(self, other: &'a $Rhs) -> $Output {
+                let ($lhs, $rhs): ($Simd, $Simd) = (self.into(), (*other).into()); $body
+            }
+        }
+
+ 
+        impl<'a> $Op<$Rhs> for &'a $Lhs {
+            #[inline]
+            fn $op(self, other: $Rhs) -> $Output {
+                let ($lhs, $rhs): ($Simd, $Simd) = ((*self).into(), other.into()); $body
+            }
+        }
+
+ 
+        impl<'a, 'b> $Op<&'a $Rhs> for &'b $Lhs {
+            #[inline]
+            fn $op(self, other: &'a $Rhs) -> $Output {
+                let ($lhs, $rhs): ($Simd, $Simd) = ((*self).into(), (*other).into()); $body
+            }
+        }
+    };
+    // When the left operand is a scalar
+    (@ls [$Simd:ident]; $Op:ident<$Rhs:ty> for $Lhs:ident {
+        fn $op:ident($lhs:ident, $rhs:ident) -> $Output:ty { $body:expr }
+    }) => {
+ 
+        impl $Op<$Rhs> for $Lhs {
+            #[inline]
+            fn $op(self, other: $Rhs) -> $Output {
+                let ($lhs, $rhs): ($Simd, $Simd) = ($Simd::splat(self), other.into()); $body
+            }
+        }
+
+ 
+        impl<'a> $Op<&'a $Rhs> for $Lhs {
+            #[inline]
+            fn $op(self, other: &'a $Rhs) -> $Output {
+                let ($lhs, $rhs): ($Simd, $Simd) = ($Simd::splat(self), (*other).into()); $body
+            }
+        }
+    };
+
+    // // When left is row-vec, right is colume-matrix
+    // (@vm [$Simd: ident]; $Op:ident<$Rhs:ty> for $Lhs:ty {
+    //     fn $op:ident($lhs:ident, $rhs:ident) -> $Output:ty {
+
+    //     }
+    // })
+
+    // When matrix with matrix
+    (@mm $Op:ident<$Rhs:ty> for $Lhs:ty {
+        fn $op:ident($lhs:ident, $rhs:ident) -> $Output:ty { $body: expr }
+    }) => {
+        impl $Op<$Rhs> for $Lhs {
+            #[inline]
+            fn $op(self, other: $Rhs) -> $Output {
+                let ($lhs, $rhs) = (self, other); $body
+            }
+        }
+
+ 
+        impl<'a> $Op<&'a $Rhs> for $Lhs {
+            #[inline]
+            fn $op(self, other: &'a $Rhs) -> $Output {
+                let ($lhs, $rhs) = (self, other); $body
+            }
+        }
+
+ 
+        impl<'a> $Op<$Rhs> for &'a $Lhs {
+            #[inline]
+            fn $op(self, other: $Rhs) -> $Output {
+                let ($lhs, $rhs) = (self, other); $body
+            }
+        }
+
+ 
+        impl<'a, 'b> $Op<&'a $Rhs> for &'b $Lhs {
+            #[inline]
+            fn $op(self, other: &'a $Rhs) -> $Output {
+                let ($lhs, $rhs) = (self, other); $body
+            }
+        }
+    }
+}
diff --git a/src/matrix.rs b/src/matrix.rs
index 6a1e2fd..ccd3b81 100644
--- a/src/matrix.rs
+++ b/src/matrix.rs
@@ -615,6 +615,7 @@ impl<S: BaseFloat> Matrix for Matrix4<S> {
     }
 }
 
+//#[cfg(not(feature = "use_simd"))]
 impl<S: BaseFloat> SquareMatrix for Matrix4<S> {
     type ColumnRow = Vector4<S>;
 
@@ -672,7 +673,7 @@ impl<S: BaseFloat> SquareMatrix for Matrix4<S> {
     }
 
     fn invert(&self) -> Option<Matrix4<S>> {
-        let det = self.determinant();
+        let det: S = self.determinant();
         if ulps_eq!(det, &S::zero()) { None } else {
             let inv_det = S::one() / det;
             let t = self.transpose();
@@ -731,6 +732,123 @@ impl<S: BaseFloat> SquareMatrix for Matrix4<S> {
         ulps_eq!(self[3][2], &self[2][3])
     }
 }
+// #[cfg(feature = "use_simd")]
+// impl<S: BaseFloat> SquareMatrix for Matrix4<S> {
+//     type ColumnRow = Vector4<S>;
+
+//     #[inline]
+//     default fn from_value(value: S) -> Matrix4<S> {
+//         Matrix4::new(value, S::zero(), S::zero(), S::zero(),
+//                      S::zero(), value, S::zero(), S::zero(),
+//                      S::zero(), S::zero(), value, S::zero(),
+//                      S::zero(), S::zero(), S::zero(), value)
+//     }
+
+//     #[inline]
+//     default fn from_diagonal(value: Vector4<S>) -> Matrix4<S> {
+//         Matrix4::new(value.x, S::zero(), S::zero(), S::zero(),
+//                      S::zero(), value.y, S::zero(), S::zero(),
+//                      S::zero(), S::zero(), value.z, S::zero(),
+//                      S::zero(), S::zero(), S::zero(), value.w)
+//     }
+
+//     default fn transpose_self(&mut self) {
+//         self.swap_elements((0, 1), (1, 0));
+//         self.swap_elements((0, 2), (2, 0));
+//         self.swap_elements((0, 3), (3, 0));
+//         self.swap_elements((1, 2), (2, 1));
+//         self.swap_elements((1, 3), (3, 1));
+//         self.swap_elements((2, 3), (3, 2));
+//     }
+
+//     default fn determinant(&self) -> S {
+//         let m0 = Matrix3::new(self[1][1], self[2][1], self[3][1],
+//                               self[1][2], self[2][2], self[3][2],
+//                               self[1][3], self[2][3], self[3][3]);
+//         let m1 = Matrix3::new(self[0][1], self[2][1], self[3][1],
+//                               self[0][2], self[2][2], self[3][2],
+//                               self[0][3], self[2][3], self[3][3]);
+//         let m2 = Matrix3::new(self[0][1], self[1][1], self[3][1],
+//                               self[0][2], self[1][2], self[3][2],
+//                               self[0][3], self[1][3], self[3][3]);
+//         let m3 = Matrix3::new(self[0][1], self[1][1], self[2][1],
+//                               self[0][2], self[1][2], self[2][2],
+//                               self[0][3], self[1][3], self[2][3]);
+
+//         self[0][0] * m0.determinant() -
+//         self[1][0] * m1.determinant() +
+//         self[2][0] * m2.determinant() -
+//         self[3][0] * m3.determinant()
+//     }
+
+//     #[inline]
+//     default fn diagonal(&self) -> Vector4<S> {
+//         Vector4::new(self[0][0],
+//                      self[1][1],
+//                      self[2][2],
+//                      self[3][3])
+//     }
+
+//     default fn invert(&self) -> Option<Matrix4<S>> {
+//         let det = self.determinant();
+//         if ulps_eq!(det, &S::zero()) { None } else {
+//             let inv_det = S::one() / det;
+//             let t = self.transpose();
+//             let cf = |i, j| {
+//                 let mat = match i {
+//                     0 => Matrix3::from_cols(t.y.truncate_n(j), t.z.truncate_n(j), t.w.truncate_n(j)),
+//                     1 => Matrix3::from_cols(t.x.truncate_n(j), t.z.truncate_n(j), t.w.truncate_n(j)),
+//                     2 => Matrix3::from_cols(t.x.truncate_n(j), t.y.truncate_n(j), t.w.truncate_n(j)),
+//                     3 => Matrix3::from_cols(t.x.truncate_n(j), t.y.truncate_n(j), t.z.truncate_n(j)),
+//                     _ => panic!("out of range"),
+//                 };
+//                 let sign = if (i + j) & 1 == 1 { -S::one() } else { S::one() };
+//                 mat.determinant() * sign * inv_det
+//             };
+
+//             Some(Matrix4::new(cf(0, 0), cf(0, 1), cf(0, 2), cf(0, 3),
+//                               cf(1, 0), cf(1, 1), cf(1, 2), cf(1, 3),
+//                               cf(2, 0), cf(2, 1), cf(2, 2), cf(2, 3),
+//                               cf(3, 0), cf(3, 1), cf(3, 2), cf(3, 3)))
+//         }
+//     }
+
+//     default fn is_diagonal(&self) -> bool {
+//         ulps_eq!(self[0][1], &S::zero()) &&
+//         ulps_eq!(self[0][2], &S::zero()) &&
+//         ulps_eq!(self[0][3], &S::zero()) &&
+
+//         ulps_eq!(self[1][0], &S::zero()) &&
+//         ulps_eq!(self[1][2], &S::zero()) &&
+//         ulps_eq!(self[1][3], &S::zero()) &&
+
+//         ulps_eq!(self[2][0], &S::zero()) &&
+//         ulps_eq!(self[2][1], &S::zero()) &&
+//         ulps_eq!(self[2][3], &S::zero()) &&
+
+//         ulps_eq!(self[3][0], &S::zero()) &&
+//         ulps_eq!(self[3][1], &S::zero()) &&
+//         ulps_eq!(self[3][2], &S::zero())
+//     }
+
+//     default fn is_symmetric(&self) -> bool {
+//         ulps_eq!(self[0][1], &self[1][0]) &&
+//         ulps_eq!(self[0][2], &self[2][0]) &&
+//         ulps_eq!(self[0][3], &self[3][0]) &&
+
+//         ulps_eq!(self[1][0], &self[0][1]) &&
+//         ulps_eq!(self[1][2], &self[2][1]) &&
+//         ulps_eq!(self[1][3], &self[3][1]) &&
+
+//         ulps_eq!(self[2][0], &self[0][2]) &&
+//         ulps_eq!(self[2][1], &self[1][2]) &&
+//         ulps_eq!(self[2][3], &self[3][2]) &&
+
+//         ulps_eq!(self[3][0], &self[0][3]) &&
+//         ulps_eq!(self[3][1], &self[1][3]) &&
+//         ulps_eq!(self[3][2], &self[2][3])
+//     }
+// }
 
 impl<S: BaseFloat> ApproxEq for Matrix2<S> {
     type Epsilon = S::Epsilon;
@@ -955,10 +1073,6 @@ macro_rules! impl_matrix {
             fn sub_assign(&mut self, other: $MatrixN<S>) { $(self.$field -= other.$field);+ }
         }
 
-        impl_operator!(<S: BaseFloat> Mul<$VectorN<S> > for $MatrixN<S> {
-            fn mul(matrix, vector) -> $VectorN<S> { $VectorN::new($(matrix.row($row_index).dot(vector.clone())),+) }
-        });
-
         impl_scalar_ops!($MatrixN<usize> { $($field),+ });
         impl_scalar_ops!($MatrixN<u8> { $($field),+ });
         impl_scalar_ops!($MatrixN<u16> { $($field),+ });
@@ -1001,6 +1115,25 @@ impl_matrix!(Matrix2, Vector2 { x: 0, y: 1 });
 impl_matrix!(Matrix3, Vector3 { x: 0, y: 1, z: 2 });
 impl_matrix!(Matrix4, Vector4 { x: 0, y: 1, z: 2, w: 3 });
 
+macro_rules! impl_mv_operator {
+    ($MatrixN:ident, $VectorN:ident { $($field:ident : $row_index:expr),+ }) => {
+        impl_operator!(<S: BaseFloat> Mul<$VectorN<S> > for $MatrixN<S> {
+            fn mul(matrix, vector) -> $VectorN<S> {$VectorN::new($(matrix.row($row_index).dot(vector.clone())),+)}
+        });
+    }
+}
+
+impl_mv_operator!(Matrix2, Vector2 { x: 0, y: 1 });
+impl_mv_operator!(Matrix3, Vector3 { x: 0, y: 1, z: 2 });
+#[cfg(not(feature = "use_simd"))]
+impl_mv_operator!(Matrix4, Vector4 { x: 0, y: 1, z: 2, w: 3 });
+#[cfg(feature = "use_simd")]
+impl_operator!(<S: BaseFloat> Mul<Vector4<S> > for Matrix4<S> {
+    fn mul(matrix, vector) -> Vector4<S> {
+        matrix[0] * vector[0] + matrix[1] * vector[1] + matrix[2] * vector[2] + matrix[3] * vector[3]
+    }
+});
+
 impl_operator!(<S: BaseFloat> Mul<Matrix2<S> > for Matrix2<S> {
     fn mul(lhs, rhs) -> Matrix2<S> {
         Matrix2::new(lhs.row(0).dot(rhs[0]), lhs.row(1).dot(rhs[0]),
@@ -1020,21 +1153,21 @@ impl_operator!(<S: BaseFloat> Mul<Matrix3<S> > for Matrix3<S> {
 // causes the LLVM to miss identical loads and multiplies. This optimization
 // causes the code to be auto vectorized properly increasing the performance
 // around ~4 times.
-macro_rules! dot_matrix4 {
-    ($A:expr, $B:expr, $I:expr, $J:expr) => {
-        ($A[0][$I]) * ($B[$J][0]) +
-        ($A[1][$I]) * ($B[$J][1]) +
-        ($A[2][$I]) * ($B[$J][2]) +
-        ($A[3][$I]) * ($B[$J][3])
-    };
-}
 
 impl_operator!(<S: BaseFloat> Mul<Matrix4<S> > for Matrix4<S> {
     fn mul(lhs, rhs) -> Matrix4<S> {
-        Matrix4::new(dot_matrix4!(lhs, rhs, 0, 0), dot_matrix4!(lhs, rhs, 1, 0), dot_matrix4!(lhs, rhs, 2, 0), dot_matrix4!(lhs, rhs, 3, 0),
-                     dot_matrix4!(lhs, rhs, 0, 1), dot_matrix4!(lhs, rhs, 1, 1), dot_matrix4!(lhs, rhs, 2, 1), dot_matrix4!(lhs, rhs, 3, 1),
-                     dot_matrix4!(lhs, rhs, 0, 2), dot_matrix4!(lhs, rhs, 1, 2), dot_matrix4!(lhs, rhs, 2, 2), dot_matrix4!(lhs, rhs, 3, 2),
-                     dot_matrix4!(lhs, rhs, 0, 3), dot_matrix4!(lhs, rhs, 1, 3), dot_matrix4!(lhs, rhs, 2, 3), dot_matrix4!(lhs, rhs, 3, 3))
+        {
+            let a = lhs[0];
+            let b = lhs[1];
+            let c = lhs[2];
+            let d = lhs[3];
+            Matrix4::from_cols(
+                a*rhs[0][0] + b*rhs[0][1] + c*rhs[0][2] + d*rhs[0][3],
+                a*rhs[1][0] + b*rhs[1][1] + c*rhs[1][2] + d*rhs[1][3],
+                a*rhs[2][0] + b*rhs[2][1] + c*rhs[2][2] + d*rhs[2][3],
+                a*rhs[3][0] + b*rhs[3][1] + c*rhs[3][2] + d*rhs[3][3],
+            )
+        }
     }
 });
 
@@ -1318,3 +1451,39 @@ impl<S: BaseFloat + Rand> Rand for Matrix4<S> {
         Matrix4{ x: rng.gen(), y: rng.gen(), z: rng.gen(), w: rng.gen() }
     }
 }
+
+// Sadly buggy.
+// #[cfg(feature = "use_simd")]
+// impl SquareMatrix for Matrix4<f32> {
+//     fn determinant(&self) -> f32 {
+//         let a = Simdf32x4::new(self.z[1], self.x[1], self.w[1], self.y[1]);
+//         let b = Simdf32x4::new(self.y[2], self.y[2], self.z[2], self.z[2]);
+//         let c = Simdf32x4::new(self.x[3], self.z[3], self.x[3], self.z[3]);
+//         let mut tmp = a * (b * c);
+//         let d = Simdf32x4::new(self.y[1], self.y[1], self.z[1], self.z[1]);
+//         let e = Simdf32x4::new(self.x[2], self.z[2], self.x[2], self.z[2]);
+//         let f = Simdf32x4::new(self.z[3], self.x[3], self.w[3], self.y[3]);
+//         let tmp1 = d * (e * f);
+//         tmp = tmp + tmp1;
+//         let g = Simdf32x4::new(self.x[1], self.z[1], self.x[1], self.z[1]);
+//         let h = Simdf32x4::new(self.z[2], self.x[2], self.w[2], self.y[2]);
+//         let i = Simdf32x4::new(self.y[3], self.y[3], self.z[3], self.z[3]);
+//         let tmp1 = g * (h * i);
+//         tmp = tmp + tmp1;
+//         let tmp1 = g * (b * f);
+//         tmp = tmp - tmp1;
+//         let tmp1 = d * (h * c);
+//         tmp = tmp - tmp1;
+//         let tmp1 = a * (e * i);
+//         tmp = tmp - tmp1;
+//         let tmp: Vector4<f32> = (tmp * Simdf32x4::new(self.x[0], self.y[0], self.z[0], self.w[0])).into();
+//         tmp.sum()
+//     }
+// }
+
+// #[cfg(feature = "use_simd")]
+// impl_operator_simd!(@mm Mul<Vector4<f32>> for Matrix4<f32> {
+//     fn mul(matrix, vector) -> Vector4<f32> {
+//         matrix[0] * vector[0] + matrix[1] * vector[1] + matrix[2] * vector[2] + matrix[3] * vector[3]
+//     }
+// });
\ No newline at end of file
diff --git a/src/vector.rs b/src/vector.rs
index dbab97b..8e37c83 100644
--- a/src/vector.rs
+++ b/src/vector.rs
@@ -25,6 +25,13 @@ use angle::Rad;
 use approx::ApproxEq;
 use num::{BaseNum, BaseFloat, PartialOrd};
 
+#[cfg(feature = "use_simd")]
+use simd::f32x4 as Simdf32x4;
+#[cfg(feature = "use_simd")]
+use simd::i32x4 as Simdi32x4;
+#[cfg(feature = "use_simd")]
+use simd::u32x4 as Simdu32x4;
+
 /// A 1-dimensional vector.
 ///
 /// This type is marked as `#[repr(C)]`.
@@ -291,6 +298,217 @@ macro_rules! impl_vector {
     }
 }
 
+// Utility macro for generating associated functions for the vectors
+#[cfg(feature = "use_simd")]
+macro_rules! impl_vector_default {
+    ($VectorN:ident { $($field:ident),+ }, $n:expr, $constructor:ident) => {
+        impl<S> $VectorN<S> {
+            /// Construct a new vector, using the provided values.
+            #[inline]
+            pub fn new($($field: S),+) -> $VectorN<S> {
+                $VectorN { $($field: $field),+ }
+            }
+        }
+
+        /// The short constructor.
+        #[inline]
+        pub fn $constructor<S>($($field: S),+) -> $VectorN<S> {
+            $VectorN::new($($field),+)
+        }
+
+        impl<S: NumCast + Copy> $VectorN<S> {
+            /// Component-wise casting to another type
+            #[inline]
+            pub fn cast<T: NumCast>(&self) -> $VectorN<T> {
+                $VectorN { $($field: NumCast::from(self.$field).unwrap()),+ }
+            }
+        }
+
+        impl<S: BaseFloat> MetricSpace for $VectorN<S> {
+            type Metric = S;
+
+            #[inline]
+            fn distance2(self, other: Self) -> S {
+                (other - self).magnitude2()
+            }
+        }
+
+        impl<S: Copy> Array for $VectorN<S> {
+            type Element = S;
+
+            #[inline]
+            fn from_value(scalar: S) -> $VectorN<S> {
+                $VectorN { $($field: scalar),+ }
+            }
+
+            #[inline]
+            fn sum(self) -> S where S: Add<Output = S> {
+                fold_array!(add, { $(self.$field),+ })
+            }
+
+            #[inline]
+            fn product(self) -> S where S: Mul<Output = S> {
+                fold_array!(mul, { $(self.$field),+ })
+            }
+
+            #[inline]
+            fn min(self) -> S where S: PartialOrd {
+                fold_array!(partial_min, { $(self.$field),+ })
+            }
+
+            #[inline]
+            fn max(self) -> S where S: PartialOrd {
+                fold_array!(partial_max, { $(self.$field),+ })
+            }
+        }
+
+        impl<S: BaseNum> Zero for $VectorN<S> {
+            #[inline]
+            fn zero() -> $VectorN<S> {
+                $VectorN::from_value(S::zero())
+            }
+
+            #[inline]
+            fn is_zero(&self) -> bool {
+                *self == $VectorN::zero()
+            }
+        }
+
+        impl<S: BaseNum> VectorSpace for $VectorN<S> {
+            type Scalar = S;
+        }
+
+        impl<S: Neg<Output = S>> Neg for $VectorN<S> {
+            type Output = $VectorN<S>;
+
+            #[inline]
+            default fn neg(self) -> $VectorN<S> { $VectorN::new($(-self.$field),+) }
+        }
+
+        impl<S: BaseFloat> ApproxEq for $VectorN<S> {
+            type Epsilon = S::Epsilon;
+
+            #[inline]
+            fn default_epsilon() -> S::Epsilon {
+                S::default_epsilon()
+            }
+
+            #[inline]
+            fn default_max_relative() -> S::Epsilon {
+                S::default_max_relative()
+            }
+
+            #[inline]
+            fn default_max_ulps() -> u32 {
+                S::default_max_ulps()
+            }
+
+            #[inline]
+            fn relative_eq(&self, other: &Self, epsilon: S::Epsilon, max_relative: S::Epsilon) -> bool {
+                $(S::relative_eq(&self.$field, &other.$field, epsilon, max_relative))&&+
+            }
+
+            #[inline]
+            fn ulps_eq(&self, other: &Self, epsilon: S::Epsilon, max_ulps: u32) -> bool {
+                $(S::ulps_eq(&self.$field, &other.$field, epsilon, max_ulps))&&+
+            }
+        }
+
+        impl<S: BaseFloat + Rand> Rand for $VectorN<S> {
+            #[inline]
+            fn rand<R: Rng>(rng: &mut R) -> $VectorN<S> {
+                $VectorN { $($field: rng.gen()),+ }
+            }
+        }
+
+        impl_operator_default!(<S: BaseNum> Add<$VectorN<S> > for $VectorN<S> {
+            fn add(lhs, rhs) -> $VectorN<S> { $VectorN::new($(lhs.$field + rhs.$field),+) }
+        });
+
+        impl_assignment_operator_default!(<S: BaseNum> AddAssign<$VectorN<S> > for $VectorN<S> {
+            fn add_assign(&mut self, other) { $(self.$field += other.$field);+ }
+        });
+
+        impl_operator_default!(<S: BaseNum> Sub<$VectorN<S> > for $VectorN<S> {
+            fn sub(lhs, rhs) -> $VectorN<S> { $VectorN::new($(lhs.$field - rhs.$field),+) }
+        });
+
+        impl_assignment_operator_default!(<S: BaseNum> SubAssign<$VectorN<S> > for $VectorN<S> {
+            fn sub_assign(&mut self, other) { $(self.$field -= other.$field);+ }
+        });
+
+        impl_operator_default!(<S: BaseNum> Mul<S> for $VectorN<S> {
+            fn mul(vector, scalar) -> $VectorN<S> { $VectorN::new($(vector.$field * scalar),+) }
+        });
+
+        impl_assignment_operator_default!(<S: BaseNum> MulAssign<S> for $VectorN<S> {
+            fn mul_assign(&mut self, scalar) { $(self.$field *= scalar);+ }
+        });
+
+        impl_operator_default!(<S: BaseNum> Div<S> for $VectorN<S> {
+            fn div(vector, scalar) -> $VectorN<S> { $VectorN::new($(vector.$field / scalar),+) }
+        });
+
+        impl_assignment_operator_default!(<S: BaseNum> DivAssign<S> for $VectorN<S> {
+            fn div_assign(&mut self, scalar) { $(self.$field /= scalar);+ }
+        });
+
+        impl_operator!(<S: BaseNum> Rem<S> for $VectorN<S> {
+            fn rem(vector, scalar) -> $VectorN<S> { $VectorN::new($(vector.$field % scalar),+) }
+        });
+        impl_assignment_operator!(<S: BaseNum> RemAssign<S> for $VectorN<S> {
+            fn rem_assign(&mut self, scalar) { $(self.$field %= scalar);+ }
+        });
+
+        impl<S: BaseNum> ElementWise for $VectorN<S> {
+            #[inline] default fn add_element_wise(self, rhs: $VectorN<S>) -> $VectorN<S> { $VectorN::new($(self.$field + rhs.$field),+) }
+            #[inline] default fn sub_element_wise(self, rhs: $VectorN<S>) -> $VectorN<S> { $VectorN::new($(self.$field - rhs.$field),+) }
+            #[inline] default fn mul_element_wise(self, rhs: $VectorN<S>) -> $VectorN<S> { $VectorN::new($(self.$field * rhs.$field),+) }
+            #[inline] default fn div_element_wise(self, rhs: $VectorN<S>) -> $VectorN<S> { $VectorN::new($(self.$field / rhs.$field),+) }
+            #[inline] fn rem_element_wise(self, rhs: $VectorN<S>) -> $VectorN<S> { $VectorN::new($(self.$field % rhs.$field),+) }
+
+            #[inline] default fn add_assign_element_wise(&mut self, rhs: $VectorN<S>) { $(self.$field += rhs.$field);+ }
+            #[inline] default fn sub_assign_element_wise(&mut self, rhs: $VectorN<S>) { $(self.$field -= rhs.$field);+ }
+            #[inline] default fn mul_assign_element_wise(&mut self, rhs: $VectorN<S>) { $(self.$field *= rhs.$field);+ }
+            #[inline] default fn div_assign_element_wise(&mut self, rhs: $VectorN<S>) { $(self.$field /= rhs.$field);+ }
+            #[inline] fn rem_assign_element_wise(&mut self, rhs: $VectorN<S>) { $(self.$field %= rhs.$field);+ }
+        }
+
+        impl<S: BaseNum> ElementWise<S> for $VectorN<S> {
+            #[inline] default fn add_element_wise(self, rhs: S) -> $VectorN<S> { $VectorN::new($(self.$field + rhs),+) }
+            #[inline] default fn sub_element_wise(self, rhs: S) -> $VectorN<S> { $VectorN::new($(self.$field - rhs),+) }
+            #[inline] default fn mul_element_wise(self, rhs: S) -> $VectorN<S> { $VectorN::new($(self.$field * rhs),+) }
+            #[inline] default fn div_element_wise(self, rhs: S) -> $VectorN<S> { $VectorN::new($(self.$field / rhs),+) }
+            #[inline] fn rem_element_wise(self, rhs: S) -> $VectorN<S> { $VectorN::new($(self.$field % rhs),+) }
+
+            #[inline] default fn add_assign_element_wise(&mut self, rhs: S) { $(self.$field += rhs);+ }
+            #[inline] default fn sub_assign_element_wise(&mut self, rhs: S) { $(self.$field -= rhs);+ }
+            #[inline] default fn mul_assign_element_wise(&mut self, rhs: S) { $(self.$field *= rhs);+ }
+            #[inline] default fn div_assign_element_wise(&mut self, rhs: S) { $(self.$field /= rhs);+ }
+            #[inline] fn rem_assign_element_wise(&mut self, rhs: S) { $(self.$field %= rhs);+ }
+        }
+
+        impl_scalar_ops!($VectorN<usize> { $($field),+ });
+        impl_scalar_ops!($VectorN<u8> { $($field),+ });
+        impl_scalar_ops!($VectorN<u16> { $($field),+ });
+        impl_scalar_ops_default!($VectorN<u32> { $($field),+ });
+        impl_scalar_ops!($VectorN<u64> { $($field),+ });
+        impl_scalar_ops!($VectorN<isize> { $($field),+ });
+        impl_scalar_ops!($VectorN<i8> { $($field),+ });
+        impl_scalar_ops!($VectorN<i16> { $($field),+ });
+        impl_scalar_ops_default!($VectorN<i32> { $($field),+ });
+        impl_scalar_ops!($VectorN<i64> { $($field),+ });
+        impl_scalar_ops_default!($VectorN<f32> { $($field),+ });
+        impl_scalar_ops!($VectorN<f64> { $($field),+ });
+
+        impl_index_operators!($VectorN<S>, $n, S, usize);
+        impl_index_operators!($VectorN<S>, $n, [S], Range<usize>);
+        impl_index_operators!($VectorN<S>, $n, [S], RangeTo<usize>);
+        impl_index_operators!($VectorN<S>, $n, [S], RangeFrom<usize>);
+        impl_index_operators!($VectorN<S>, $n, [S], RangeFull);
+    }
+}
+
 macro_rules! impl_scalar_ops {
     ($VectorN:ident<$S:ident> { $($field:ident),+ }) => {
         impl_operator!(Mul<$VectorN<$S>> for $S {
@@ -305,10 +523,28 @@ macro_rules! impl_scalar_ops {
     };
 }
 
+#[cfg(feature = "use_simd")]
+macro_rules! impl_scalar_ops_default {
+    ($VectorN:ident<$S:ident> { $($field:ident),+ }) => {
+        impl_operator_default!(Mul<$VectorN<$S>> for $S {
+            fn mul(scalar, vector) -> $VectorN<$S> { $VectorN::new($(scalar * vector.$field),+) }
+        });
+        impl_operator_default!(Div<$VectorN<$S>> for $S {
+            fn div(scalar, vector) -> $VectorN<$S> { $VectorN::new($(scalar / vector.$field),+) }
+        });
+        impl_operator_default!(Rem<$VectorN<$S>> for $S {
+            fn rem(scalar, vector) -> $VectorN<$S> { $VectorN::new($(scalar % vector.$field),+) }
+        });
+    };
+}
+
 impl_vector!(Vector1 { x }, 1, vec1);
 impl_vector!(Vector2 { x, y }, 2, vec2);
 impl_vector!(Vector3 { x, y, z }, 3, vec3);
+#[cfg(not(feature = "use_simd"))]
 impl_vector!(Vector4 { x, y, z, w }, 4, vec4);
+#[cfg(feature = "use_simd")]
+impl_vector_default!(Vector4 { x, y, z, w }, 4, vec4);
 
 impl_fixed_array_conversions!(Vector1<S> { x: 0 }, 1);
 impl_fixed_array_conversions!(Vector2<S> { x: 0, y: 1 }, 2);
@@ -350,7 +586,7 @@ impl<S: BaseNum> Vector2<S> {
     /// Create a `Vector3`, using the `x` and `y` values from this vector, and the
     /// provided `z`.
     #[inline]
-    pub fn extend(self, z: S)-> Vector3<S> {
+    pub fn extend(self, z: S) -> Vector3<S> {
         Vector3::new(self.x, self.y, z)
     }
 }
@@ -386,13 +622,13 @@ impl<S: BaseNum> Vector3<S> {
     /// Create a `Vector4`, using the `x`, `y` and `z` values from this vector, and the
     /// provided `w`.
     #[inline]
-    pub fn extend(self, w: S)-> Vector4<S> {
+    pub fn extend(self, w: S) -> Vector4<S> {
         Vector4::new(self.x, self.y, self.z, w)
     }
 
     /// Create a `Vector2`, dropping the `z` value.
     #[inline]
-    pub fn truncate(self)-> Vector2<S> {
+    pub fn truncate(self) -> Vector2<S> {
         Vector2::new(self.x, self.y)
     }
 }
@@ -424,27 +660,27 @@ impl<S: BaseNum> Vector4<S> {
 
     /// Create a `Vector3`, dropping the `w` value.
     #[inline]
-    pub fn truncate(self)-> Vector3<S> {
+    pub fn truncate(self) -> Vector3<S> {
         Vector3::new(self.x, self.y, self.z)
     }
 
     /// Create a `Vector3`, dropping the nth element
     #[inline]
-    pub fn truncate_n(&self, n: isize)-> Vector3<S> {
+    pub fn truncate_n(&self, n: isize) -> Vector3<S> {
         match n {
             0 => Vector3::new(self.y, self.z, self.w),
             1 => Vector3::new(self.x, self.z, self.w),
             2 => Vector3::new(self.x, self.y, self.w),
             3 => Vector3::new(self.x, self.y, self.z),
-            _ => panic!("{:?} is out of range", n)
+            _ => panic!("{:?} is out of range", n),
         }
     }
 }
 
 /// Dot product of two vectors.
 #[inline]
-pub fn dot<V: InnerSpace>(a: V, b: V) -> V::Scalar where
-    V::Scalar: BaseFloat,
+pub fn dot<V: InnerSpace>(a: V, b: V) -> V::Scalar
+    where V::Scalar: BaseFloat
 {
     V::dot(a, b)
 }
@@ -515,6 +751,371 @@ impl<S: fmt::Debug> fmt::Debug for Vector4<S> {
     }
 }
 
+#[cfg(feature = "use_simd")]
+impl From<Simdf32x4> for Vector4<f32> {
+    #[inline]
+    fn from(f: Simdf32x4) -> Self {
+        unsafe {
+            let mut ret: Self = mem::uninitialized();
+            {
+                let ret_mut: &mut [f32; 4] = ret.as_mut();
+                f.store(ret_mut.as_mut(), 0 as usize);
+            }
+            ret
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl Vector4<f32> {
+    /// Compute and return the square root of each element.
+    #[inline]
+    pub fn sqrt_element_wide(self) -> Self {
+        let s: Simdf32x4 = self.into();
+        s.sqrt().into()
+    }
+
+    /// Compute and return the reciprocal of the square root of each element.
+    #[inline]
+    pub fn rsqrt_element_wide(self) -> Self {
+        let s: Simdf32x4 = self.into();
+        s.approx_rsqrt().into()
+    }
+
+    /// Compute and return the reciprocal of each element.
+    #[inline]
+    pub fn recip_element_wide(self) -> Self {
+        let s: Simdf32x4 = self.into();
+        s.approx_reciprocal().into()
+    }
+}
+
+
+
+#[cfg(feature = "use_simd")]
+impl Into<Simdf32x4> for Vector4<f32> {
+    #[inline]
+    fn into(self) -> Simdf32x4 {
+        let self_ref: &[f32; 4] = self.as_ref();
+        Simdf32x4::load(self_ref.as_ref(), 0 as usize)
+    }
+}
+
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{
+    [Simdf32x4]; Sub<Vector4<f32>> for Vector4<f32> {
+        fn sub(lhs, rhs) -> Vector4<f32> {
+            (lhs - rhs).into()
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{@rs
+    [Simdf32x4]; Mul<f32> for Vector4<f32> {
+        fn mul(lhs, rhs) -> Vector4<f32> {
+            (lhs * rhs).into()
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{@rs
+    [Simdf32x4]; Div<f32> for Vector4<f32> {
+        fn div(lhs, rhs) -> Vector4<f32> {
+            (lhs / rhs).into()
+        }
+    }
+}
+
+
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{
+    [Simdf32x4]; Neg for Vector4<f32> {
+        fn neg(lhs) -> Vector4<f32> {
+            (-lhs).into()
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl AddAssign for Vector4<f32> {
+    #[inline]
+    fn add_assign(&mut self, rhs: Self) {
+        let s: Simdf32x4 = (*self).into();
+        let rhs: Simdf32x4 = rhs.into();
+        *self = (s + rhs).into();
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl SubAssign for Vector4<f32> {
+    #[inline]
+    fn sub_assign(&mut self, rhs: Self) {
+        let s: Simdf32x4 = (*self).into();
+        let rhs: Simdf32x4 = rhs.into();
+        *self = (s - rhs).into();
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl MulAssign<f32> for Vector4<f32> {
+    fn mul_assign(&mut self, other: f32) {
+        let s: Simdf32x4 = (*self).into();
+        let other = Simdf32x4::splat(other);
+        *self = (s * other).into();
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl DivAssign<f32> for Vector4<f32> {
+    fn div_assign(&mut self, other: f32) {
+        let s: Simdf32x4 = (*self).into();
+        let other = Simdf32x4::splat(other);
+        *self = (s / other).into();
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl ElementWise for Vector4<f32> {
+    #[inline] fn add_element_wise(self, rhs: Vector4<f32>) -> Vector4<f32> { self + rhs }
+    #[inline] fn sub_element_wise(self, rhs: Vector4<f32>) -> Vector4<f32> { self - rhs }
+    #[inline] fn mul_element_wise(self, rhs: Vector4<f32>) -> Vector4<f32> {
+        let s: Simdf32x4 = self.into();
+        let rhs: Simdf32x4 = rhs.into();
+        (s * rhs).into()
+    }
+    #[inline] fn div_element_wise(self, rhs: Vector4<f32>) -> Vector4<f32> {
+        let s: Simdf32x4 = self.into();
+        let rhs: Simdf32x4 = rhs.into();
+        (s / rhs).into()
+    }
+
+    #[inline] fn add_assign_element_wise(&mut self, rhs: Vector4<f32>) { (*self) += rhs; }
+    #[inline] fn sub_assign_element_wise(&mut self, rhs: Vector4<f32>) { (*self) -= rhs; }
+    #[inline] fn mul_assign_element_wise(&mut self, rhs: Vector4<f32>) {
+        let s: Simdf32x4 = (*self).into();
+        let rhs: Simdf32x4 = rhs.into();
+        *self = (s * rhs).into();
+    }
+    #[inline] fn div_assign_element_wise(&mut self, rhs: Vector4<f32>) {
+        let s: Simdf32x4 = (*self).into();
+        let rhs: Simdf32x4 = rhs.into();
+        *self = (s * rhs).into();
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl ElementWise<f32> for Vector4<f32> {
+    #[inline] fn add_element_wise(self, rhs: f32) -> Vector4<f32> {
+        let s: Simdf32x4 = self.into();
+        let rhs = Simdf32x4::splat(rhs);
+        (s + rhs).into()
+    }
+    #[inline] fn sub_element_wise(self, rhs: f32) -> Vector4<f32> {
+        let s: Simdf32x4 = self.into();
+        let rhs = Simdf32x4::splat(rhs);
+        (s - rhs).into()
+    }
+    #[inline] fn mul_element_wise(self, rhs: f32) -> Vector4<f32> { self * rhs }
+    #[inline] fn div_element_wise(self, rhs: f32) -> Vector4<f32> { self / rhs }
+
+    #[inline] fn add_assign_element_wise(&mut self, rhs: f32) {
+        let s: Simdf32x4 = (*self).into();
+        let rhs = Simdf32x4::splat(rhs);
+        *self = (s + rhs).into();
+    }
+    #[inline] fn sub_assign_element_wise(&mut self, rhs: f32) {
+        let s: Simdf32x4 = (*self).into();
+        let rhs = Simdf32x4::splat(rhs);
+        *self = (s - rhs).into();
+    }
+    #[inline] fn mul_assign_element_wise(&mut self, rhs: f32) { (*self) *= rhs; }
+    #[inline] fn div_assign_element_wise(&mut self, rhs: f32) { (*self) /= rhs; }
+}
+
+#[cfg(feature = "use_simd")]
+impl From<Simdi32x4> for Vector4<i32> {
+    #[inline]
+    fn from(f: Simdi32x4) -> Self {
+        unsafe {
+            let mut ret: Self = mem::uninitialized();
+            {
+                let ret_mut: &mut [i32; 4] = ret.as_mut();
+                f.store(ret_mut.as_mut(), 0 as usize);
+            }
+            ret
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl Into<Simdi32x4> for Vector4<i32> {
+    #[inline]
+    fn into(self) -> Simdi32x4 {
+        let self_ref: &[i32; 4] = self.as_ref();
+        Simdi32x4::load(self_ref.as_ref(), 0 as usize)
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{
+    [Simdi32x4]; Add<Vector4<i32>> for Vector4<i32> {
+        fn add(lhs, rhs) -> Vector4<i32> {
+            (lhs + rhs).into()
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{
+    [Simdi32x4]; Sub<Vector4<i32>> for Vector4<i32> {
+        fn sub(lhs, rhs) -> Vector4<i32> {
+            (lhs - rhs).into()
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{@rs
+    [Simdi32x4]; Mul<i32> for Vector4<i32> {
+        fn mul(lhs, rhs) -> Vector4<i32> {
+            (lhs * rhs).into()
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{
+    [Simdi32x4]; Neg for Vector4<i32> {
+        fn neg(lhs) -> Vector4<i32> {
+            (-lhs).into()
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl AddAssign for Vector4<i32> {
+    #[inline]
+    fn add_assign(&mut self, rhs: Self) {
+        let s: Simdi32x4 = (*self).into();
+        let rhs: Simdi32x4 = rhs.into();
+        *self = (s + rhs).into();
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl SubAssign for Vector4<i32> {
+    #[inline]
+    fn sub_assign(&mut self, rhs: Self) {
+        let s: Simdi32x4 = (*self).into();
+        let rhs: Simdi32x4 = rhs.into();
+        *self = (s - rhs).into();
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl MulAssign<i32> for Vector4<i32> {
+    fn mul_assign(&mut self, other: i32) {
+        let s: Simdi32x4 = (*self).into();
+        let other = Simdi32x4::splat(other);
+        *self = (s * other).into();
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl From<Simdu32x4> for Vector4<u32> {
+    #[inline]
+    fn from(f: Simdu32x4) -> Self {
+        unsafe {
+            let mut ret: Self = mem::uninitialized();
+            {
+                let ret_mut: &mut [u32; 4] = ret.as_mut();
+                f.store(ret_mut.as_mut(), 0 as usize);
+            }
+            ret
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl Into<Simdu32x4> for Vector4<u32> {
+    #[inline]
+    fn into(self) -> Simdu32x4 {
+        let self_ref: &[u32; 4] = self.as_ref();
+        Simdu32x4::load(self_ref.as_ref(), 0 as usize)
+    }
+}
+
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{
+    [Simdu32x4]; Add<Vector4<u32>> for Vector4<u32> {
+        fn add(lhs, rhs) -> Vector4<u32> {
+            (lhs + rhs).into()
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{
+    [Simdf32x4]; Add<Vector4<f32>> for Vector4<f32> {
+        fn add(lhs, rhs) -> Vector4<f32> {
+            (lhs + rhs).into()
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{
+    [Simdu32x4]; Sub<Vector4<u32>> for Vector4<u32> {
+        fn sub(lhs, rhs) -> Vector4<u32> {
+            (lhs - rhs).into()
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{@rs
+    [Simdu32x4]; Mul<u32> for Vector4<u32> {
+        fn mul(lhs, rhs) -> Vector4<u32> {
+            (lhs * rhs).into()
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl AddAssign for Vector4<u32> {
+    #[inline]
+    fn add_assign(&mut self, rhs: Self) {
+        let s: Simdu32x4 = (*self).into();
+        let rhs: Simdu32x4 = rhs.into();
+        *self = (s + rhs).into();
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl SubAssign for Vector4<u32> {
+    #[inline]
+    fn sub_assign(&mut self, rhs: Self) {
+        let s: Simdu32x4 = (*self).into();
+        let rhs: Simdu32x4 = rhs.into();
+        *self = (s - rhs).into();
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl MulAssign<u32> for Vector4<u32> {
+    fn mul_assign(&mut self, other: u32) {
+        let s: Simdu32x4 = (*self).into();
+        let other = Simdu32x4::splat(other);
+        *self = (s * other).into();
+    }
+}
+
+
 #[cfg(test)]
 mod tests {
     mod vector2 {
@@ -729,7 +1330,12 @@ mod tests {
     mod vector4 {
         use vector::*;
 
-        const VECTOR4: Vector4<i32> = Vector4 { x: 1, y: 2, z: 3, w: 4 };
+        const VECTOR4: Vector4<i32> = Vector4 {
+            x: 1,
+            y: 2,
+            z: 3,
+            w: 4,
+        };
 
         #[test]
         fn test_index() {
@@ -796,11 +1402,11 @@ mod tests {
         fn test_as_mut() {
             let mut v = VECTOR4;
             {
-                let v: &mut[i32; 4] = v.as_mut();
+                let v: &mut [i32; 4] = v.as_mut();
                 assert_eq!(v, &mut [1, 2, 3, 4]);
             }
             {
-                let v: &mut(i32, i32, i32, i32) = v.as_mut();
+                let v: &mut (i32, i32, i32, i32) = v.as_mut();
                 assert_eq!(v, &mut (1, 2, 3, 4));
             }
         }
diff --git a/tests/quaternion.rs b/tests/quaternion.rs
index f59911d..5b00619 100644
--- a/tests/quaternion.rs
+++ b/tests/quaternion.rs
@@ -194,13 +194,13 @@ mod rotate_from_euler {
         let vec = vec3(0.0, 1.0, 0.0);
 
         let rot = Quaternion::from(Euler::new(Deg(90.0), Deg(90.0), Deg(0.0)));
-        assert_ulps_eq!(vec3(0.0, 0.0, 1.0), rot * vec);
+        assert_ulps_eq!(vec3(0.0f32, 0.0f32, 1.0f32), rot * vec);
     }
 
     // tests that the Z rotation is done after the Y
     #[test]
     fn test_y_then_z() {
-        let vec = vec3(0.0, 0.0, 1.0);
+        let vec = vec3(0.0f32, 0.0f32, 1.0f32);
 
         let rot = Quaternion::from(Euler::new(Deg(0.0), Deg(90.0), Deg(90.0)));
         assert_ulps_eq!(vec3(1.0, 0.0, 0.0), rot * vec);
diff --git a/tests/vectorf32.rs b/tests/vectorf32.rs
new file mode 100644
index 0000000..fd930e8
--- /dev/null
+++ b/tests/vectorf32.rs
@@ -0,0 +1,267 @@
+// Copyright 2013-2014 The CGMath Developers. For a full listing of the authors,
+// refer to the Cargo.toml file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0f32 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0f32
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#[macro_use]
+extern crate approx;
+#[macro_use]
+extern crate cgmath;
+
+use cgmath::*;
+use std::f32;
+
+#[test]
+fn test_constructor() {
+    assert_eq!(vec2(1f32, 2f32), Vector2::new(1f32, 2f32));
+    assert_eq!(vec3(1f32, 2f32, 3f32), Vector3::new(1f32, 2f32, 3f32));
+    assert_eq!(vec4(1f32, 2f32, 3f32, 4f32), Vector4::new(1f32, 2f32, 3f32, 4f32));
+}
+
+#[test]
+fn test_from_value() {
+    assert_eq!(Vector2::from_value(102f32), Vector2::new(102f32, 102f32));
+    assert_eq!(Vector3::from_value(22f32), Vector3::new(22f32, 22f32, 22f32));
+    assert_eq!(Vector4::from_value(76.5f32), Vector4::new(76.5f32, 76.5f32, 76.5f32, 76.5f32));
+}
+
+macro_rules! impl_test_add {
+    ($VectorN:ident { $($field:ident),+ }, $s:expr, $v:expr) => (
+        // vector + vector ops
+        assert_eq!($v + $v, $VectorN::new($($v.$field + $v.$field),+));
+        assert_eq!(&$v + &$v, $v + $v);
+        assert_eq!(&$v + $v, $v + $v);
+        assert_eq!($v + &$v, $v + $v);
+    )
+}
+
+macro_rules! impl_test_sub {
+    ($VectorN:ident { $($field:ident),+ }, $s:expr, $v:expr) => (
+        // vector - vector ops
+        assert_eq!($v - $v, $VectorN::new($($v.$field - $v.$field),+));
+        assert_eq!(&$v - &$v, $v - $v);
+        assert_eq!(&$v - $v, $v - $v);
+        assert_eq!($v - &$v, $v - $v);
+    )
+}
+
+macro_rules! impl_test_mul {
+    ($VectorN:ident { $($field:ident),+ }, $s:expr, $v:expr) => (
+        // vector * scalar ops
+        assert_eq!($v * $s, $VectorN::new($($v.$field * $s),+));
+        assert_eq!($s * $v, $VectorN::new($($s * $v.$field),+));
+        assert_eq!(&$v * $s, $v * $s);
+        assert_eq!($s * &$v, $s * $v);
+        // commutativity
+        assert_eq!($v * $s, $s * $v);
+    )
+}
+
+macro_rules! impl_test_div {
+    ($VectorN:ident { $($field:ident),+ }, $s:expr, $v:expr) => (
+        // vector / scalar ops
+        assert_eq!($v / $s, $VectorN::new($($v.$field / $s),+));
+        assert_eq!($s / $v, $VectorN::new($($s / $v.$field),+));
+        assert_eq!(&$v / $s, $v / $s);
+        assert_eq!($s / &$v, $s / $v);
+    )
+}
+
+macro_rules! impl_test_rem {
+    ($VectorN:ident { $($field:ident),+ }, $s:expr, $v:expr) => (
+        // vector % scalar ops
+        assert_eq!($v % $s, $VectorN::new($($v.$field % $s),+));
+        assert_eq!($s % $v, $VectorN::new($($s % $v.$field),+));
+        assert_eq!(&$v % $s, $v % $s);
+        assert_eq!($s % &$v, $s % $v);
+    )
+}
+
+#[test]
+fn test_add() {
+    impl_test_add!(Vector4 { x, y, z, w }, 2.0f32, vec4(2.0f32, 4.0f32, 6.0f32, 8.0f32));
+}
+
+#[test]
+fn test_sub() {
+    impl_test_sub!(Vector4 { x, y, z, w }, 2.0f32, vec4(2.0f32, 4.0f32, 6.0f32, 8.0f32));
+    impl_test_sub!(Vector3 { x, y, z }, 2.0f32, vec3(2.0f32, 4.0f32, 6.0f32));
+    impl_test_sub!(Vector2 { x, y }, 2.0f32, vec2(2.0f32, 4.0f32));
+}
+
+#[test]
+fn test_mul() {
+    impl_test_mul!(Vector4 { x, y, z, w }, 2.0f32, vec4(2.0f32, 4.0f32, 6.0f32, 8.0f32));
+    impl_test_mul!(Vector3 { x, y, z }, 2.0f32, vec3(2.0f32, 4.0f32, 6.0f32));
+    impl_test_mul!(Vector2 { x, y }, 2.0f32, vec2(2.0f32, 4.0f32));
+}
+
+#[test]
+fn test_div() {
+    impl_test_div!(Vector4 { x, y, z, w }, 2.0f32, vec4(2.0f32, 4.0f32, 6.0f32, 8.0f32));
+    impl_test_div!(Vector3 { x, y, z }, 2.0f32, vec3(2.0f32, 4.0f32, 6.0f32));
+    impl_test_div!(Vector2 { x, y }, 2.0f32, vec2(2.0f32, 4.0f32));
+}
+
+#[test]
+fn test_rem() {
+    impl_test_rem!(Vector4 { x, y, z, w }, 2.0f32, vec4(2.0f32, 4.0f32, 6.0f32, 8.0f32));
+    impl_test_rem!(Vector3 { x, y, z }, 2.0f32, vec3(2.0f32, 4.0f32, 6.0f32));
+    impl_test_rem!(Vector2 { x, y }, 2.0f32, vec2(2.0f32, 4.0f32));
+}
+
+#[test]
+fn test_dot() {
+    assert_eq!(Vector2::new(1.0f32, 2.0f32).dot(Vector2::new(3.0f32, 4.0f32)), 11.0f32);
+    assert_eq!(Vector3::new(1.0f32, 2.0f32, 3.0f32).dot(Vector3::new(4.0f32, 5.0f32, 6.0f32)), 32.0f32);
+    assert_eq!(Vector4::new(1.0f32, 2.0f32, 3.0f32, 4.0f32).dot(Vector4::new(5.0f32, 6.0f32, 7.0f32, 8.0f32)), 70.0f32);
+}
+
+#[test]
+fn test_sum() {
+    assert_eq!(Vector2::new(1f32, 2f32).sum(), 3f32);
+    assert_eq!(Vector3::new(1f32, 2f32, 3f32).sum(), 6f32);
+    assert_eq!(Vector4::new(1f32, 2f32, 3f32, 4f32).sum(), 10f32);
+
+    assert_eq!(Vector2::new(3.0f32, 4.0f32).sum(), 7.0f32);
+    assert_eq!(Vector3::new(4.0f32, 5.0f32, 6.0f32).sum(), 15.0f32);
+    assert_eq!(Vector4::new(5.0f32, 6.0f32, 7.0f32, 8.0f32).sum(), 26.0f32);
+}
+
+#[test]
+fn test_product() {
+    assert_eq!(Vector2::new(1f32, 2f32).product(), 2f32);
+    assert_eq!(Vector3::new(1f32, 2f32, 3f32).product(), 6f32);
+    assert_eq!(Vector4::new(1f32, 2f32, 3f32, 4f32).product(), 24f32);
+
+    assert_eq!(Vector2::new(3.0f32, 4.0f32).product(), 12.0f32);
+    assert_eq!(Vector3::new(4.0f32, 5.0f32, 6.0f32).product(), 120.0f32);
+    assert_eq!(Vector4::new(5.0f32, 6.0f32, 7.0f32, 8.0f32).product(), 1680.0f32);
+}
+
+#[test]
+fn test_min() {
+    assert_eq!(Vector2::new(1f32, 2f32).min(), 1f32);
+    assert_eq!(Vector3::new(1f32, 2f32, 3f32).min(), 1f32);
+    assert_eq!(Vector4::new(1f32, 2f32, 3f32, 4f32).min(), 1f32);
+
+    assert_eq!(Vector2::new(3.0f32, 4.0f32).min(), 3.0f32);
+    assert_eq!(Vector3::new(4.0f32, 5.0f32, 6.0f32).min(), 4.0f32);
+    assert_eq!(Vector4::new(5.0f32, 6.0f32, 7.0f32, 8.0f32).min(), 5.0f32);
+}
+
+#[test]
+fn test_max() {
+    assert_eq!(Vector2::new(1f32, 2f32).max(), 2f32);
+    assert_eq!(Vector3::new(1f32, 2f32, 3f32).max(), 3f32);
+    assert_eq!(Vector4::new(1f32, 2f32, 3f32, 4f32).max(), 4f32);
+
+    assert_eq!(Vector2::new(3.0f32, 4.0f32).max(), 4.0f32);
+    assert_eq!(Vector3::new(4.0f32, 5.0f32, 6.0f32).max(), 6.0f32);
+    assert_eq!(Vector4::new(5.0f32, 6.0f32, 7.0f32, 8.0f32).max(), 8.0f32);
+}
+
+#[test]
+fn test_cross() {
+    let a = Vector3::new(1f32, 2f32, 3f32);
+    let b = Vector3::new(4f32, 5f32, 6f32);
+    let r = Vector3::new(-3f32, 6f32, -3f32);
+    assert_eq!(a.cross(b), r);
+}
+
+#[test]
+fn test_is_perpendicular() {
+    assert!(Vector2::new(1.0f32, 0.0f32).is_perpendicular(Vector2::new(0.0f32, 1.0f32)));
+    assert!(Vector3::new(0.0f32, 1.0f32, 0.0f32).is_perpendicular(Vector3::new(0.0f32, 0.0f32, 1.0f32)));
+    assert!(Vector4::new(1.0f32, 0.0f32, 0.0f32, 0.0f32).is_perpendicular(Vector4::new(0.0f32, 0.0f32, 0.0f32, 1.0f32)));
+}
+
+#[cfg(test)]
+mod test_magnitude {
+    use cgmath::*;
+
+    #[test]
+    fn test_vector2(){
+        let (a, a_res) = (Vector2::new(3.0f32, 4.0f32), 5.0f32); // (3, 4, 5) Pythagorean triple
+        let (b, b_res) = (Vector2::new(5.0f32, 12.0f32), 13.0f32); // (5, 12, 13) Pythagorean triple
+
+        assert_eq!(a.magnitude2(), a_res * a_res);
+        assert_eq!(b.magnitude2(), b_res * b_res);
+
+        assert_eq!(a.magnitude(), a_res);
+        assert_eq!(b.magnitude(), b_res);
+    }
+
+    #[test]
+    fn test_vector3(){
+        let (a, a_res) = (Vector3::new(2.0f32, 3.0f32, 6.0f32), 7.0f32); // (2, 3, 6, 7) Pythagorean quadruple
+        let (b, b_res) = (Vector3::new(1.0f32, 4.0f32, 8.0f32), 9.0f32); // (1, 4, 8, 9) Pythagorean quadruple
+
+        assert_eq!(a.magnitude2(), a_res * a_res);
+        assert_eq!(b.magnitude2(), b_res * b_res);
+
+        assert_eq!(a.magnitude(), a_res);
+        assert_eq!(b.magnitude(), b_res);
+    }
+
+    #[test]
+    fn test_vector4(){
+        let (a, a_res) = (Vector4::new(1.0f32, 2.0f32, 4.0f32, 10.0f32), 11.0f32); // (1, 2, 4, 10, 11) Pythagorean quintuple
+        let (b, b_res) = (Vector4::new(1.0f32, 2.0f32, 8.0f32, 10.0f32), 13.0f32); // (1, 2, 8, 10, 13) Pythagorean quintuple
+
+        assert_eq!(a.magnitude2(), a_res * a_res);
+        assert_eq!(b.magnitude2(), b_res * b_res);
+
+        assert_eq!(a.magnitude(), a_res);
+        assert_eq!(b.magnitude(), b_res);
+
+        #[cfg(feature = "use_simd")]
+        {
+            let a = Vector4::new(1f32, 4f32, 9f32, 16f32);
+            assert_ulps_eq!(a.sqrt_element_wide(), Vector4::new(1f32, 2f32, 3f32, 4f32));
+            assert_relative_eq!(a.sqrt_element_wide().recip_element_wide(), Vector4::new(1f32, 1f32/2f32, 1f32/3f32, 1f32/4f32), max_relative = 0.005f32);
+            assert_relative_eq!(a.rsqrt_element_wide(), Vector4::new(1f32, 1f32/2f32, 1f32/3f32, 1f32/4f32), max_relative= 0.005f32);
+        }
+        
+    }
+}
+
+#[test]
+fn test_angle() {
+    assert_ulps_eq!(Vector2::new(1.0f32, 0.0f32).angle(Vector2::new(0.0f32, 1.0f32)), &Rad(f32::consts::FRAC_PI_2));
+    assert_ulps_eq!(Vector2::new(10.0f32, 0.0f32).angle(Vector2::new(0.0f32, 5.0f32)), &Rad(f32::consts::FRAC_PI_2));
+    assert_ulps_eq!(Vector2::new(-1.0f32, 0.0f32).angle(Vector2::new(0.0f32, 1.0f32)), &-Rad(f32::consts::FRAC_PI_2));
+
+    assert_ulps_eq!(Vector3::new(1.0f32, 0.0f32, 1.0f32).angle(Vector3::new(1.0f32, 1.0f32, 0.0f32)), &Rad(f32::consts::FRAC_PI_3));
+    assert_ulps_eq!(Vector3::new(10.0f32, 0.0f32, 10.0f32).angle(Vector3::new(5.0f32, 5.0f32, 0.0f32)), &Rad(f32::consts::FRAC_PI_3));
+    assert_ulps_eq!(Vector3::new(-1.0f32, 0.0f32, -1.0f32).angle(Vector3::new(1.0f32, -1.0f32, 0.0f32)), &Rad(2.0f32 * f32::consts::FRAC_PI_3));
+
+    assert_ulps_eq!(Vector4::new(1.0f32, 0.0f32, 1.0f32, 0.0f32).angle(Vector4::new(0.0f32, 1.0f32, 0.0f32, 1.0f32)), &Rad(f32::consts::FRAC_PI_2));
+    assert_ulps_eq!(Vector4::new(10.0f32, 0.0f32, 10.0f32, 0.0f32).angle(Vector4::new(0.0f32, 5.0f32, 0.0f32, 5.0f32)), &Rad(f32::consts::FRAC_PI_2));
+    assert_ulps_eq!(Vector4::new(-1.0f32, 0.0f32, -1.0f32, 0.0f32).angle(Vector4::new(0.0f32, 1.0f32, 0.0f32, 1.0f32)), &Rad(f32::consts::FRAC_PI_2));
+}
+
+#[test]
+fn test_normalize() {
+    // TODO: test normalize_to, normalize_sel.0f32, and normalize_self_to
+    assert_ulps_eq!(Vector2::new(3.0f32, 4.0f32).normalize(), &Vector2::new(3.0f32/5.0f32, 4.0f32/5.0f32));
+    assert_ulps_eq!(Vector3::new(2.0f32, 3.0f32, 6.0f32).normalize(), &Vector3::new(2.0f32/7.0f32, 3.0f32/7.0f32, 6.0f32/7.0f32));
+    assert_ulps_eq!(Vector4::new(1.0f32, 2.0f32, 4.0f32, 10.0f32).normalize(), &Vector4::new(1.0f32/11.0f32, 2.0f32/11.0f32, 4.0f32/11.0f32, 10.0f32/11.0f32));
+}
+
+#[test]
+fn test_cast() {
+    assert_ulps_eq!(Vector2::new(0.9f32, 1.5).cast(), Vector2::new(0.9f32, 1.5));
+    assert_ulps_eq!(Vector3::new(1.0f32, 2.4, -3.13).cast(), Vector3::new(1.0f32, 2.4, -3.13));
+    assert_ulps_eq!(Vector4::new(13.5f32, -4.6, -8.3, 2.41).cast(), Vector4::new(13.5f32, -4.6, -8.3, 2.41));
+}

From d45536f1bd75dff6978e80af28eaa2091ae764f9 Mon Sep 17 00:00:00 2001
From: Luxko <liarluxlux@gmail.com>
Date: Sat, 25 Feb 2017 07:42:15 +0800
Subject: [PATCH 2/6] Clean up a bit

---
 src/matrix.rs | 124 ++------------------------------------------------
 src/vector.rs |   1 +
 2 files changed, 5 insertions(+), 120 deletions(-)

diff --git a/src/matrix.rs b/src/matrix.rs
index ccd3b81..dc9f856 100644
--- a/src/matrix.rs
+++ b/src/matrix.rs
@@ -615,7 +615,7 @@ impl<S: BaseFloat> Matrix for Matrix4<S> {
     }
 }
 
-//#[cfg(not(feature = "use_simd"))]
+
 impl<S: BaseFloat> SquareMatrix for Matrix4<S> {
     type ColumnRow = Vector4<S>;
 
@@ -673,7 +673,7 @@ impl<S: BaseFloat> SquareMatrix for Matrix4<S> {
     }
 
     fn invert(&self) -> Option<Matrix4<S>> {
-        let det: S = self.determinant();
+        let det = self.determinant();
         if ulps_eq!(det, &S::zero()) { None } else {
             let inv_det = S::one() / det;
             let t = self.transpose();
@@ -732,123 +732,6 @@ impl<S: BaseFloat> SquareMatrix for Matrix4<S> {
         ulps_eq!(self[3][2], &self[2][3])
     }
 }
-// #[cfg(feature = "use_simd")]
-// impl<S: BaseFloat> SquareMatrix for Matrix4<S> {
-//     type ColumnRow = Vector4<S>;
-
-//     #[inline]
-//     default fn from_value(value: S) -> Matrix4<S> {
-//         Matrix4::new(value, S::zero(), S::zero(), S::zero(),
-//                      S::zero(), value, S::zero(), S::zero(),
-//                      S::zero(), S::zero(), value, S::zero(),
-//                      S::zero(), S::zero(), S::zero(), value)
-//     }
-
-//     #[inline]
-//     default fn from_diagonal(value: Vector4<S>) -> Matrix4<S> {
-//         Matrix4::new(value.x, S::zero(), S::zero(), S::zero(),
-//                      S::zero(), value.y, S::zero(), S::zero(),
-//                      S::zero(), S::zero(), value.z, S::zero(),
-//                      S::zero(), S::zero(), S::zero(), value.w)
-//     }
-
-//     default fn transpose_self(&mut self) {
-//         self.swap_elements((0, 1), (1, 0));
-//         self.swap_elements((0, 2), (2, 0));
-//         self.swap_elements((0, 3), (3, 0));
-//         self.swap_elements((1, 2), (2, 1));
-//         self.swap_elements((1, 3), (3, 1));
-//         self.swap_elements((2, 3), (3, 2));
-//     }
-
-//     default fn determinant(&self) -> S {
-//         let m0 = Matrix3::new(self[1][1], self[2][1], self[3][1],
-//                               self[1][2], self[2][2], self[3][2],
-//                               self[1][3], self[2][3], self[3][3]);
-//         let m1 = Matrix3::new(self[0][1], self[2][1], self[3][1],
-//                               self[0][2], self[2][2], self[3][2],
-//                               self[0][3], self[2][3], self[3][3]);
-//         let m2 = Matrix3::new(self[0][1], self[1][1], self[3][1],
-//                               self[0][2], self[1][2], self[3][2],
-//                               self[0][3], self[1][3], self[3][3]);
-//         let m3 = Matrix3::new(self[0][1], self[1][1], self[2][1],
-//                               self[0][2], self[1][2], self[2][2],
-//                               self[0][3], self[1][3], self[2][3]);
-
-//         self[0][0] * m0.determinant() -
-//         self[1][0] * m1.determinant() +
-//         self[2][0] * m2.determinant() -
-//         self[3][0] * m3.determinant()
-//     }
-
-//     #[inline]
-//     default fn diagonal(&self) -> Vector4<S> {
-//         Vector4::new(self[0][0],
-//                      self[1][1],
-//                      self[2][2],
-//                      self[3][3])
-//     }
-
-//     default fn invert(&self) -> Option<Matrix4<S>> {
-//         let det = self.determinant();
-//         if ulps_eq!(det, &S::zero()) { None } else {
-//             let inv_det = S::one() / det;
-//             let t = self.transpose();
-//             let cf = |i, j| {
-//                 let mat = match i {
-//                     0 => Matrix3::from_cols(t.y.truncate_n(j), t.z.truncate_n(j), t.w.truncate_n(j)),
-//                     1 => Matrix3::from_cols(t.x.truncate_n(j), t.z.truncate_n(j), t.w.truncate_n(j)),
-//                     2 => Matrix3::from_cols(t.x.truncate_n(j), t.y.truncate_n(j), t.w.truncate_n(j)),
-//                     3 => Matrix3::from_cols(t.x.truncate_n(j), t.y.truncate_n(j), t.z.truncate_n(j)),
-//                     _ => panic!("out of range"),
-//                 };
-//                 let sign = if (i + j) & 1 == 1 { -S::one() } else { S::one() };
-//                 mat.determinant() * sign * inv_det
-//             };
-
-//             Some(Matrix4::new(cf(0, 0), cf(0, 1), cf(0, 2), cf(0, 3),
-//                               cf(1, 0), cf(1, 1), cf(1, 2), cf(1, 3),
-//                               cf(2, 0), cf(2, 1), cf(2, 2), cf(2, 3),
-//                               cf(3, 0), cf(3, 1), cf(3, 2), cf(3, 3)))
-//         }
-//     }
-
-//     default fn is_diagonal(&self) -> bool {
-//         ulps_eq!(self[0][1], &S::zero()) &&
-//         ulps_eq!(self[0][2], &S::zero()) &&
-//         ulps_eq!(self[0][3], &S::zero()) &&
-
-//         ulps_eq!(self[1][0], &S::zero()) &&
-//         ulps_eq!(self[1][2], &S::zero()) &&
-//         ulps_eq!(self[1][3], &S::zero()) &&
-
-//         ulps_eq!(self[2][0], &S::zero()) &&
-//         ulps_eq!(self[2][1], &S::zero()) &&
-//         ulps_eq!(self[2][3], &S::zero()) &&
-
-//         ulps_eq!(self[3][0], &S::zero()) &&
-//         ulps_eq!(self[3][1], &S::zero()) &&
-//         ulps_eq!(self[3][2], &S::zero())
-//     }
-
-//     default fn is_symmetric(&self) -> bool {
-//         ulps_eq!(self[0][1], &self[1][0]) &&
-//         ulps_eq!(self[0][2], &self[2][0]) &&
-//         ulps_eq!(self[0][3], &self[3][0]) &&
-
-//         ulps_eq!(self[1][0], &self[0][1]) &&
-//         ulps_eq!(self[1][2], &self[2][1]) &&
-//         ulps_eq!(self[1][3], &self[3][1]) &&
-
-//         ulps_eq!(self[2][0], &self[0][2]) &&
-//         ulps_eq!(self[2][1], &self[1][2]) &&
-//         ulps_eq!(self[2][3], &self[3][2]) &&
-
-//         ulps_eq!(self[3][0], &self[0][3]) &&
-//         ulps_eq!(self[3][1], &self[1][3]) &&
-//         ulps_eq!(self[3][2], &self[2][3])
-//     }
-// }
 
 impl<S: BaseFloat> ApproxEq for Matrix2<S> {
     type Epsilon = S::Epsilon;
@@ -1452,7 +1335,8 @@ impl<S: BaseFloat + Rand> Rand for Matrix4<S> {
     }
 }
 
-// Sadly buggy.
+// Specialization strangely won't work for this.
+// TODO: find another way
 // #[cfg(feature = "use_simd")]
 // impl SquareMatrix for Matrix4<f32> {
 //     fn determinant(&self) -> f32 {
diff --git a/src/vector.rs b/src/vector.rs
index 8e37c83..f3828a6 100644
--- a/src/vector.rs
+++ b/src/vector.rs
@@ -299,6 +299,7 @@ macro_rules! impl_vector {
 }
 
 // Utility macro for generating associated functions for the vectors
+// mainly duplication
 #[cfg(feature = "use_simd")]
 macro_rules! impl_vector_default {
     ($VectorN:ident { $($field:ident),+ }, $n:expr, $constructor:ident) => {

From 68d1d272221e0f8eb0203b7021d45b0e3f4acf57 Mon Sep 17 00:00:00 2001
From: Luxko <liarluxlux@gmail.com>
Date: Sat, 25 Feb 2017 20:31:38 +0800
Subject: [PATCH 3/6] Add SIMD support for determinant and inverse

Also:
- Change the travis script: no use `cargo bench`ing in beta or stable.
- Add bench test for Matrix4's determinant().
- clean code a bit
---
 .travis.yml    |   4 +-
 benches/mat.rs |   2 +
 src/macros.rs  |  59 +--------------------------
 src/matrix.rs  | 106 +++++++++++++++++++++++++------------------------
 4 files changed, 61 insertions(+), 110 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 38d6b8d..9768b66 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -6,4 +6,6 @@ rust:
   - stable
 
 script:
-  - cargo build && cargo test && cargo bench
+  - cargo build && cargo test
+  - if [[ "$TRAVIS_RUST_VERSION" == "nightly" ]]; then cargo bench;
+  - if [[ "$TRAVIS_RUST_VERSION" == "nightly" ]]; then cargo build --features "use_simd" && cargo test --features "use_simd" && cargo bench --features "use_simd"
diff --git a/benches/mat.rs b/benches/mat.rs
index f472be2..9d94a72 100644
--- a/benches/mat.rs
+++ b/benches/mat.rs
@@ -59,3 +59,5 @@ bench_unop!(_bench_matrix4_invert, Matrix4<f32>, invert);
 bench_unop!(_bench_matrix2_transpose, Matrix2<f32>, transpose);
 bench_unop!(_bench_matrix3_transpose, Matrix3<f32>, transpose);
 bench_unop!(_bench_matrix4_transpose, Matrix4<f32>, transpose);
+
+bench_unop!(_bench_matrix4_determinant, Matrix4<f32>, determinant);
\ No newline at end of file
diff --git a/src/macros.rs b/src/macros.rs
index ac741e8..e1fa8c6 100644
--- a/src/macros.rs
+++ b/src/macros.rs
@@ -381,21 +381,11 @@ macro_rules! impl_operator_simd {
                 let $x: $Simd = self.into(); $body
             }
         }
-
-        // #[cfg(feature = "simd")]
-        // impl<'a> $Op for &'a $Lhs {
-        //     type Output = $Output;
-        //     #[inline]
-        //     fn $op(self) -> $Output {
-        //         let $x: $Simd = (*self).into(); $body
-        //     }
-        // }
     };
     // When the right operand is a scalar
     (@rs [$Simd:ident]; $Op:ident<$Rhs:ty> for $Lhs:ty {
         fn $op:ident($lhs:ident, $rhs:ident) -> $Output:ty { $body:expr }
     }) => {
- 
         impl $Op<$Rhs> for $Lhs {
             #[inline]
             fn $op(self, other: $Rhs) -> $Output {
@@ -411,6 +401,7 @@ macro_rules! impl_operator_simd {
             }
         }
     };
+
     // When the right operand is a compound type
     ([$Simd:ident]; $Op:ident<$Rhs:ty> for $Lhs:ty {
         fn $op:ident($lhs:ident, $rhs:ident) -> $Output:ty { $body:expr }
@@ -430,7 +421,6 @@ macro_rules! impl_operator_simd {
                 let ($lhs, $rhs): ($Simd, $Simd) = (self.into(), (*other).into()); $body
             }
         }
-
  
         impl<'a> $Op<$Rhs> for &'a $Lhs {
             #[inline]
@@ -439,7 +429,6 @@ macro_rules! impl_operator_simd {
             }
         }
 
- 
         impl<'a, 'b> $Op<&'a $Rhs> for &'b $Lhs {
             #[inline]
             fn $op(self, other: &'a $Rhs) -> $Output {
@@ -447,18 +436,17 @@ macro_rules! impl_operator_simd {
             }
         }
     };
+
     // When the left operand is a scalar
     (@ls [$Simd:ident]; $Op:ident<$Rhs:ty> for $Lhs:ident {
         fn $op:ident($lhs:ident, $rhs:ident) -> $Output:ty { $body:expr }
     }) => {
- 
         impl $Op<$Rhs> for $Lhs {
             #[inline]
             fn $op(self, other: $Rhs) -> $Output {
                 let ($lhs, $rhs): ($Simd, $Simd) = ($Simd::splat(self), other.into()); $body
             }
         }
-
  
         impl<'a> $Op<&'a $Rhs> for $Lhs {
             #[inline]
@@ -467,47 +455,4 @@ macro_rules! impl_operator_simd {
             }
         }
     };
-
-    // // When left is row-vec, right is colume-matrix
-    // (@vm [$Simd: ident]; $Op:ident<$Rhs:ty> for $Lhs:ty {
-    //     fn $op:ident($lhs:ident, $rhs:ident) -> $Output:ty {
-
-    //     }
-    // })
-
-    // When matrix with matrix
-    (@mm $Op:ident<$Rhs:ty> for $Lhs:ty {
-        fn $op:ident($lhs:ident, $rhs:ident) -> $Output:ty { $body: expr }
-    }) => {
-        impl $Op<$Rhs> for $Lhs {
-            #[inline]
-            fn $op(self, other: $Rhs) -> $Output {
-                let ($lhs, $rhs) = (self, other); $body
-            }
-        }
-
- 
-        impl<'a> $Op<&'a $Rhs> for $Lhs {
-            #[inline]
-            fn $op(self, other: &'a $Rhs) -> $Output {
-                let ($lhs, $rhs) = (self, other); $body
-            }
-        }
-
- 
-        impl<'a> $Op<$Rhs> for &'a $Lhs {
-            #[inline]
-            fn $op(self, other: $Rhs) -> $Output {
-                let ($lhs, $rhs) = (self, other); $body
-            }
-        }
-
- 
-        impl<'a, 'b> $Op<&'a $Rhs> for &'b $Lhs {
-            #[inline]
-            fn $op(self, other: &'a $Rhs) -> $Output {
-                let ($lhs, $rhs) = (self, other); $body
-            }
-        }
-    }
 }
diff --git a/src/matrix.rs b/src/matrix.rs
index dc9f856..397d2ba 100644
--- a/src/matrix.rs
+++ b/src/matrix.rs
@@ -645,23 +645,10 @@ impl<S: BaseFloat> SquareMatrix for Matrix4<S> {
     }
 
     fn determinant(&self) -> S {
-        let m0 = Matrix3::new(self[1][1], self[2][1], self[3][1],
-                              self[1][2], self[2][2], self[3][2],
-                              self[1][3], self[2][3], self[3][3]);
-        let m1 = Matrix3::new(self[0][1], self[2][1], self[3][1],
-                              self[0][2], self[2][2], self[3][2],
-                              self[0][3], self[2][3], self[3][3]);
-        let m2 = Matrix3::new(self[0][1], self[1][1], self[3][1],
-                              self[0][2], self[1][2], self[3][2],
-                              self[0][3], self[1][3], self[3][3]);
-        let m3 = Matrix3::new(self[0][1], self[1][1], self[2][1],
-                              self[0][2], self[1][2], self[2][2],
-                              self[0][3], self[1][3], self[2][3]);
-
-        self[0][0] * m0.determinant() -
-        self[1][0] * m1.determinant() +
-        self[2][0] * m2.determinant() -
-        self[3][0] * m3.determinant()
+        let tmp = unsafe {
+            det_sub_proc_unsafe(self, 1, 2, 3)
+        };
+        tmp.dot(Vector4::new(self[0][0], self[1][0], self[2][0], self[3][0]))
     }
 
     #[inline]
@@ -672,6 +659,12 @@ impl<S: BaseFloat> SquareMatrix for Matrix4<S> {
                      self[3][3])
     }
 
+    // The new implementation results in negative optimization when used 
+    // without SIMD. so we opt them in with configuration.
+    // A better option would be using specialization. But currently somewhat
+    // specialization is too buggy, and it won't apply here. I'm getting
+    // weird error msgs. Help wanted.
+    #[cfg(not(feature = "use_simd"))]
     fn invert(&self) -> Option<Matrix4<S>> {
         let det = self.determinant();
         if ulps_eq!(det, &S::zero()) { None } else {
@@ -695,6 +688,27 @@ impl<S: BaseFloat> SquareMatrix for Matrix4<S> {
                               cf(3, 0), cf(3, 1), cf(3, 2), cf(3, 3)))
         }
     }
+    #[cfg(feature = "use_simd")]
+    fn invert(&self) -> Option<Matrix4<S>> {
+        let tmp0 = unsafe {
+            det_sub_proc_unsafe(self, 1, 2, 3)
+        };
+        let det = tmp0.dot(Vector4::new(self[0][0], self[1][0], self[2][0], self[3][0]));
+        if ulps_eq!(det, &S::zero()) { None } else {
+            let inv_det = S::one() / det;
+            let tmp0 = tmp0 * inv_det;
+            let tmp1 = unsafe {
+                det_sub_proc_unsafe(self, 0, 3, 2) * inv_det
+            };
+            let tmp2 = unsafe {
+                det_sub_proc_unsafe(self, 0, 1, 3) * inv_det
+            };
+            let tmp3 = unsafe {
+                det_sub_proc_unsafe(self, 0, 2, 1) * inv_det
+            };
+            Some(Matrix4::from_cols(tmp0, tmp1, tmp2, tmp3))
+        }
+    }
 
     fn is_diagonal(&self) -> bool {
         ulps_eq!(self[0][1], &S::zero()) &&
@@ -1036,6 +1050,7 @@ impl_operator!(<S: BaseFloat> Mul<Matrix3<S> > for Matrix3<S> {
 // causes the LLVM to miss identical loads and multiplies. This optimization
 // causes the code to be auto vectorized properly increasing the performance
 // around ~4 times.
+// Update: this should now be a bit more efficient
 
 impl_operator!(<S: BaseFloat> Mul<Matrix4<S> > for Matrix4<S> {
     fn mul(lhs, rhs) -> Matrix4<S> {
@@ -1335,39 +1350,26 @@ impl<S: BaseFloat + Rand> Rand for Matrix4<S> {
     }
 }
 
-// Specialization strangely won't work for this.
-// TODO: find another way
-// #[cfg(feature = "use_simd")]
-// impl SquareMatrix for Matrix4<f32> {
-//     fn determinant(&self) -> f32 {
-//         let a = Simdf32x4::new(self.z[1], self.x[1], self.w[1], self.y[1]);
-//         let b = Simdf32x4::new(self.y[2], self.y[2], self.z[2], self.z[2]);
-//         let c = Simdf32x4::new(self.x[3], self.z[3], self.x[3], self.z[3]);
-//         let mut tmp = a * (b * c);
-//         let d = Simdf32x4::new(self.y[1], self.y[1], self.z[1], self.z[1]);
-//         let e = Simdf32x4::new(self.x[2], self.z[2], self.x[2], self.z[2]);
-//         let f = Simdf32x4::new(self.z[3], self.x[3], self.w[3], self.y[3]);
-//         let tmp1 = d * (e * f);
-//         tmp = tmp + tmp1;
-//         let g = Simdf32x4::new(self.x[1], self.z[1], self.x[1], self.z[1]);
-//         let h = Simdf32x4::new(self.z[2], self.x[2], self.w[2], self.y[2]);
-//         let i = Simdf32x4::new(self.y[3], self.y[3], self.z[3], self.z[3]);
-//         let tmp1 = g * (h * i);
-//         tmp = tmp + tmp1;
-//         let tmp1 = g * (b * f);
-//         tmp = tmp - tmp1;
-//         let tmp1 = d * (h * c);
-//         tmp = tmp - tmp1;
-//         let tmp1 = a * (e * i);
-//         tmp = tmp - tmp1;
-//         let tmp: Vector4<f32> = (tmp * Simdf32x4::new(self.x[0], self.y[0], self.z[0], self.w[0])).into();
-//         tmp.sum()
-//     }
-// }
+// Sub procedure for SIMD when dealing with determinant and inversion
+#[inline]
+unsafe fn det_sub_proc_unsafe<S: BaseFloat>(m: &Matrix4<S>, x: usize, y: usize, z: usize) -> Vector4<S> {
+    let s: &[S; 16] = m.as_ref();
+    let a = Vector4::new(*s.get_unchecked(4 + x), *s.get_unchecked(12 + x), *s.get_unchecked(x), *s.get_unchecked(8 + x));
+    let b = Vector4::new(*s.get_unchecked(8 + y), *s.get_unchecked(8 + y), *s.get_unchecked(4 + y), *s.get_unchecked(4 + y));
+    let c = Vector4::new(*s.get_unchecked(12 + z), *s.get_unchecked(z), *s.get_unchecked(12 + z), *s.get_unchecked(z));
 
-// #[cfg(feature = "use_simd")]
-// impl_operator_simd!(@mm Mul<Vector4<f32>> for Matrix4<f32> {
-//     fn mul(matrix, vector) -> Vector4<f32> {
-//         matrix[0] * vector[0] + matrix[1] * vector[1] + matrix[2] * vector[2] + matrix[3] * vector[3]
-//     }
-// });
\ No newline at end of file
+    let d = Vector4::new(*s.get_unchecked(8 + x), *s.get_unchecked(8 + x), *s.get_unchecked(4 + x), *s.get_unchecked(4 + x));
+    let e = Vector4::new(*s.get_unchecked(12 + y), *s.get_unchecked(y), *s.get_unchecked(12 + y), *s.get_unchecked(y));
+    let f = Vector4::new(*s.get_unchecked(4 + z), *s.get_unchecked(12 + z), *s.get_unchecked(z), *s.get_unchecked(8 + z));
+
+    let g = Vector4::new(*s.get_unchecked(12 + x), *s.get_unchecked(x), *s.get_unchecked(12 + x), *s.get_unchecked(x));
+    let h = Vector4::new(*s.get_unchecked(4 + y), *s.get_unchecked(12 + y), *s.get_unchecked(y), *s.get_unchecked(8 + y));
+    let i = Vector4::new(*s.get_unchecked(8 + z), *s.get_unchecked(8 + z), *s.get_unchecked(4 + z), *s.get_unchecked(4 + z));
+    let mut tmp = a.mul_element_wise(b.mul_element_wise(c));
+    tmp += d.mul_element_wise(e.mul_element_wise(f));
+    tmp += g.mul_element_wise(h.mul_element_wise(i));
+    tmp -= a.mul_element_wise(e.mul_element_wise(i));
+    tmp -= d.mul_element_wise(h.mul_element_wise(c));
+    tmp -= g.mul_element_wise(b.mul_element_wise(f));
+    tmp
+}
\ No newline at end of file

From 17699b7ec35e0112a9923bc922e474226b18469a Mon Sep 17 00:00:00 2001
From: Luxko <liarluxlux@gmail.com>
Date: Sat, 25 Feb 2017 20:47:43 +0800
Subject: [PATCH 4/6] fix build script

---
 .travis.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 9768b66..7539fd4 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -7,5 +7,5 @@ rust:
 
 script:
   - cargo build && cargo test
-  - if [[ "$TRAVIS_RUST_VERSION" == "nightly" ]]; then cargo bench;
-  - if [[ "$TRAVIS_RUST_VERSION" == "nightly" ]]; then cargo build --features "use_simd" && cargo test --features "use_simd" && cargo bench --features "use_simd"
+  - if [[ "$TRAVIS_RUST_VERSION" == "nightly" ]]; then cargo bench; fi
+  - if [[ "$TRAVIS_RUST_VERSION" == "nightly" ]]; then cargo build --features "use_simd" && cargo test --features "use_simd" && cargo bench --features "use_simd"; fi

From 09c8727c7c22ff9a8ccb65e71b6719b4a7f6f0df Mon Sep 17 00:00:00 2001
From: Luxko <liarluxlux@gmail.com>
Date: Sun, 12 Mar 2017 15:59:06 +0800
Subject: [PATCH 5/6] Add newline at end of file

---
 benches/mat.rs                        |  2 +-
 src/matrix.rs                         |  2 +-
 tests/{vectorf32.rs => vector4f32.rs} | 76 ---------------------------
 3 files changed, 2 insertions(+), 78 deletions(-)
 rename tests/{vectorf32.rs => vector4f32.rs} (58%)

diff --git a/benches/mat.rs b/benches/mat.rs
index 9d94a72..88efab9 100644
--- a/benches/mat.rs
+++ b/benches/mat.rs
@@ -60,4 +60,4 @@ bench_unop!(_bench_matrix2_transpose, Matrix2<f32>, transpose);
 bench_unop!(_bench_matrix3_transpose, Matrix3<f32>, transpose);
 bench_unop!(_bench_matrix4_transpose, Matrix4<f32>, transpose);
 
-bench_unop!(_bench_matrix4_determinant, Matrix4<f32>, determinant);
\ No newline at end of file
+bench_unop!(_bench_matrix4_determinant, Matrix4<f32>, determinant);
diff --git a/src/matrix.rs b/src/matrix.rs
index 397d2ba..e900c66 100644
--- a/src/matrix.rs
+++ b/src/matrix.rs
@@ -1372,4 +1372,4 @@ unsafe fn det_sub_proc_unsafe<S: BaseFloat>(m: &Matrix4<S>, x: usize, y: usize,
     tmp -= d.mul_element_wise(h.mul_element_wise(c));
     tmp -= g.mul_element_wise(b.mul_element_wise(f));
     tmp
-}
\ No newline at end of file
+}
diff --git a/tests/vectorf32.rs b/tests/vector4f32.rs
similarity index 58%
rename from tests/vectorf32.rs
rename to tests/vector4f32.rs
index fd930e8..b860d76 100644
--- a/tests/vectorf32.rs
+++ b/tests/vector4f32.rs
@@ -23,15 +23,11 @@ use std::f32;
 
 #[test]
 fn test_constructor() {
-    assert_eq!(vec2(1f32, 2f32), Vector2::new(1f32, 2f32));
-    assert_eq!(vec3(1f32, 2f32, 3f32), Vector3::new(1f32, 2f32, 3f32));
     assert_eq!(vec4(1f32, 2f32, 3f32, 4f32), Vector4::new(1f32, 2f32, 3f32, 4f32));
 }
 
 #[test]
 fn test_from_value() {
-    assert_eq!(Vector2::from_value(102f32), Vector2::new(102f32, 102f32));
-    assert_eq!(Vector3::from_value(22f32), Vector3::new(22f32, 22f32, 22f32));
     assert_eq!(Vector4::from_value(76.5f32), Vector4::new(76.5f32, 76.5f32, 76.5f32, 76.5f32));
 }
 
@@ -95,94 +91,58 @@ fn test_add() {
 #[test]
 fn test_sub() {
     impl_test_sub!(Vector4 { x, y, z, w }, 2.0f32, vec4(2.0f32, 4.0f32, 6.0f32, 8.0f32));
-    impl_test_sub!(Vector3 { x, y, z }, 2.0f32, vec3(2.0f32, 4.0f32, 6.0f32));
-    impl_test_sub!(Vector2 { x, y }, 2.0f32, vec2(2.0f32, 4.0f32));
 }
 
 #[test]
 fn test_mul() {
     impl_test_mul!(Vector4 { x, y, z, w }, 2.0f32, vec4(2.0f32, 4.0f32, 6.0f32, 8.0f32));
-    impl_test_mul!(Vector3 { x, y, z }, 2.0f32, vec3(2.0f32, 4.0f32, 6.0f32));
-    impl_test_mul!(Vector2 { x, y }, 2.0f32, vec2(2.0f32, 4.0f32));
 }
 
 #[test]
 fn test_div() {
     impl_test_div!(Vector4 { x, y, z, w }, 2.0f32, vec4(2.0f32, 4.0f32, 6.0f32, 8.0f32));
-    impl_test_div!(Vector3 { x, y, z }, 2.0f32, vec3(2.0f32, 4.0f32, 6.0f32));
-    impl_test_div!(Vector2 { x, y }, 2.0f32, vec2(2.0f32, 4.0f32));
 }
 
 #[test]
 fn test_rem() {
     impl_test_rem!(Vector4 { x, y, z, w }, 2.0f32, vec4(2.0f32, 4.0f32, 6.0f32, 8.0f32));
-    impl_test_rem!(Vector3 { x, y, z }, 2.0f32, vec3(2.0f32, 4.0f32, 6.0f32));
-    impl_test_rem!(Vector2 { x, y }, 2.0f32, vec2(2.0f32, 4.0f32));
 }
 
 #[test]
 fn test_dot() {
-    assert_eq!(Vector2::new(1.0f32, 2.0f32).dot(Vector2::new(3.0f32, 4.0f32)), 11.0f32);
-    assert_eq!(Vector3::new(1.0f32, 2.0f32, 3.0f32).dot(Vector3::new(4.0f32, 5.0f32, 6.0f32)), 32.0f32);
     assert_eq!(Vector4::new(1.0f32, 2.0f32, 3.0f32, 4.0f32).dot(Vector4::new(5.0f32, 6.0f32, 7.0f32, 8.0f32)), 70.0f32);
 }
 
 #[test]
 fn test_sum() {
-    assert_eq!(Vector2::new(1f32, 2f32).sum(), 3f32);
-    assert_eq!(Vector3::new(1f32, 2f32, 3f32).sum(), 6f32);
     assert_eq!(Vector4::new(1f32, 2f32, 3f32, 4f32).sum(), 10f32);
 
-    assert_eq!(Vector2::new(3.0f32, 4.0f32).sum(), 7.0f32);
-    assert_eq!(Vector3::new(4.0f32, 5.0f32, 6.0f32).sum(), 15.0f32);
     assert_eq!(Vector4::new(5.0f32, 6.0f32, 7.0f32, 8.0f32).sum(), 26.0f32);
 }
 
 #[test]
 fn test_product() {
-    assert_eq!(Vector2::new(1f32, 2f32).product(), 2f32);
-    assert_eq!(Vector3::new(1f32, 2f32, 3f32).product(), 6f32);
     assert_eq!(Vector4::new(1f32, 2f32, 3f32, 4f32).product(), 24f32);
 
-    assert_eq!(Vector2::new(3.0f32, 4.0f32).product(), 12.0f32);
-    assert_eq!(Vector3::new(4.0f32, 5.0f32, 6.0f32).product(), 120.0f32);
     assert_eq!(Vector4::new(5.0f32, 6.0f32, 7.0f32, 8.0f32).product(), 1680.0f32);
 }
 
 #[test]
 fn test_min() {
-    assert_eq!(Vector2::new(1f32, 2f32).min(), 1f32);
-    assert_eq!(Vector3::new(1f32, 2f32, 3f32).min(), 1f32);
     assert_eq!(Vector4::new(1f32, 2f32, 3f32, 4f32).min(), 1f32);
 
-    assert_eq!(Vector2::new(3.0f32, 4.0f32).min(), 3.0f32);
-    assert_eq!(Vector3::new(4.0f32, 5.0f32, 6.0f32).min(), 4.0f32);
     assert_eq!(Vector4::new(5.0f32, 6.0f32, 7.0f32, 8.0f32).min(), 5.0f32);
 }
 
 #[test]
 fn test_max() {
-    assert_eq!(Vector2::new(1f32, 2f32).max(), 2f32);
-    assert_eq!(Vector3::new(1f32, 2f32, 3f32).max(), 3f32);
     assert_eq!(Vector4::new(1f32, 2f32, 3f32, 4f32).max(), 4f32);
 
-    assert_eq!(Vector2::new(3.0f32, 4.0f32).max(), 4.0f32);
-    assert_eq!(Vector3::new(4.0f32, 5.0f32, 6.0f32).max(), 6.0f32);
     assert_eq!(Vector4::new(5.0f32, 6.0f32, 7.0f32, 8.0f32).max(), 8.0f32);
 }
 
-#[test]
-fn test_cross() {
-    let a = Vector3::new(1f32, 2f32, 3f32);
-    let b = Vector3::new(4f32, 5f32, 6f32);
-    let r = Vector3::new(-3f32, 6f32, -3f32);
-    assert_eq!(a.cross(b), r);
-}
-
 #[test]
 fn test_is_perpendicular() {
-    assert!(Vector2::new(1.0f32, 0.0f32).is_perpendicular(Vector2::new(0.0f32, 1.0f32)));
-    assert!(Vector3::new(0.0f32, 1.0f32, 0.0f32).is_perpendicular(Vector3::new(0.0f32, 0.0f32, 1.0f32)));
     assert!(Vector4::new(1.0f32, 0.0f32, 0.0f32, 0.0f32).is_perpendicular(Vector4::new(0.0f32, 0.0f32, 0.0f32, 1.0f32)));
 }
 
@@ -190,30 +150,6 @@ fn test_is_perpendicular() {
 mod test_magnitude {
     use cgmath::*;
 
-    #[test]
-    fn test_vector2(){
-        let (a, a_res) = (Vector2::new(3.0f32, 4.0f32), 5.0f32); // (3, 4, 5) Pythagorean triple
-        let (b, b_res) = (Vector2::new(5.0f32, 12.0f32), 13.0f32); // (5, 12, 13) Pythagorean triple
-
-        assert_eq!(a.magnitude2(), a_res * a_res);
-        assert_eq!(b.magnitude2(), b_res * b_res);
-
-        assert_eq!(a.magnitude(), a_res);
-        assert_eq!(b.magnitude(), b_res);
-    }
-
-    #[test]
-    fn test_vector3(){
-        let (a, a_res) = (Vector3::new(2.0f32, 3.0f32, 6.0f32), 7.0f32); // (2, 3, 6, 7) Pythagorean quadruple
-        let (b, b_res) = (Vector3::new(1.0f32, 4.0f32, 8.0f32), 9.0f32); // (1, 4, 8, 9) Pythagorean quadruple
-
-        assert_eq!(a.magnitude2(), a_res * a_res);
-        assert_eq!(b.magnitude2(), b_res * b_res);
-
-        assert_eq!(a.magnitude(), a_res);
-        assert_eq!(b.magnitude(), b_res);
-    }
-
     #[test]
     fn test_vector4(){
         let (a, a_res) = (Vector4::new(1.0f32, 2.0f32, 4.0f32, 10.0f32), 11.0f32); // (1, 2, 4, 10, 11) Pythagorean quintuple
@@ -238,14 +174,6 @@ mod test_magnitude {
 
 #[test]
 fn test_angle() {
-    assert_ulps_eq!(Vector2::new(1.0f32, 0.0f32).angle(Vector2::new(0.0f32, 1.0f32)), &Rad(f32::consts::FRAC_PI_2));
-    assert_ulps_eq!(Vector2::new(10.0f32, 0.0f32).angle(Vector2::new(0.0f32, 5.0f32)), &Rad(f32::consts::FRAC_PI_2));
-    assert_ulps_eq!(Vector2::new(-1.0f32, 0.0f32).angle(Vector2::new(0.0f32, 1.0f32)), &-Rad(f32::consts::FRAC_PI_2));
-
-    assert_ulps_eq!(Vector3::new(1.0f32, 0.0f32, 1.0f32).angle(Vector3::new(1.0f32, 1.0f32, 0.0f32)), &Rad(f32::consts::FRAC_PI_3));
-    assert_ulps_eq!(Vector3::new(10.0f32, 0.0f32, 10.0f32).angle(Vector3::new(5.0f32, 5.0f32, 0.0f32)), &Rad(f32::consts::FRAC_PI_3));
-    assert_ulps_eq!(Vector3::new(-1.0f32, 0.0f32, -1.0f32).angle(Vector3::new(1.0f32, -1.0f32, 0.0f32)), &Rad(2.0f32 * f32::consts::FRAC_PI_3));
-
     assert_ulps_eq!(Vector4::new(1.0f32, 0.0f32, 1.0f32, 0.0f32).angle(Vector4::new(0.0f32, 1.0f32, 0.0f32, 1.0f32)), &Rad(f32::consts::FRAC_PI_2));
     assert_ulps_eq!(Vector4::new(10.0f32, 0.0f32, 10.0f32, 0.0f32).angle(Vector4::new(0.0f32, 5.0f32, 0.0f32, 5.0f32)), &Rad(f32::consts::FRAC_PI_2));
     assert_ulps_eq!(Vector4::new(-1.0f32, 0.0f32, -1.0f32, 0.0f32).angle(Vector4::new(0.0f32, 1.0f32, 0.0f32, 1.0f32)), &Rad(f32::consts::FRAC_PI_2));
@@ -254,14 +182,10 @@ fn test_angle() {
 #[test]
 fn test_normalize() {
     // TODO: test normalize_to, normalize_sel.0f32, and normalize_self_to
-    assert_ulps_eq!(Vector2::new(3.0f32, 4.0f32).normalize(), &Vector2::new(3.0f32/5.0f32, 4.0f32/5.0f32));
-    assert_ulps_eq!(Vector3::new(2.0f32, 3.0f32, 6.0f32).normalize(), &Vector3::new(2.0f32/7.0f32, 3.0f32/7.0f32, 6.0f32/7.0f32));
     assert_ulps_eq!(Vector4::new(1.0f32, 2.0f32, 4.0f32, 10.0f32).normalize(), &Vector4::new(1.0f32/11.0f32, 2.0f32/11.0f32, 4.0f32/11.0f32, 10.0f32/11.0f32));
 }
 
 #[test]
 fn test_cast() {
-    assert_ulps_eq!(Vector2::new(0.9f32, 1.5).cast(), Vector2::new(0.9f32, 1.5));
-    assert_ulps_eq!(Vector3::new(1.0f32, 2.4, -3.13).cast(), Vector3::new(1.0f32, 2.4, -3.13));
     assert_ulps_eq!(Vector4::new(13.5f32, -4.6, -8.3, 2.41).cast(), Vector4::new(13.5f32, -4.6, -8.3, 2.41));
 }

From 194c4770e968ed4c10fc326c6d9ec541b42a901a Mon Sep 17 00:00:00 2001
From: Luxko <liarluxlux@gmail.com>
Date: Sun, 12 Mar 2017 20:44:58 +0800
Subject: [PATCH 6/6] add basic SIMD support for Quaternion<f32>

---
 src/quaternion.rs | 234 +++++++++++++++++++++++++++++++++++++++++++++-
 src/vector.rs     |  18 ++--
 2 files changed, 240 insertions(+), 12 deletions(-)

diff --git a/src/quaternion.rs b/src/quaternion.rs
index c97aea6..384bf1e 100644
--- a/src/quaternion.rs
+++ b/src/quaternion.rs
@@ -30,6 +30,8 @@ use point::Point3;
 use rotation::{Rotation, Rotation3, Basis3};
 use vector::Vector3;
 
+#[cfg(feature = "use_simd")]
+use simd::f32x4 as Simdf32x4;
 
 /// A [quaternion](https://en.wikipedia.org/wiki/Quaternion) in scalar/vector
 /// form.
@@ -46,6 +48,30 @@ pub struct Quaternion<S> {
     pub v: Vector3<S>,
 }
 
+#[cfg(feature = "use_simd")]
+impl From<Simdf32x4> for Quaternion<f32> {
+    #[inline]
+    fn from(f: Simdf32x4) -> Self {
+        unsafe {
+            let mut ret: Self = mem::uninitialized();
+            {
+                let ret_mut: &mut [f32; 4] = ret.as_mut();
+                f.store(ret_mut.as_mut(), 0 as usize);
+            }
+            ret
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl Into<Simdf32x4> for Quaternion<f32> {
+    #[inline]
+    fn into(self) -> Simdf32x4 {
+        let self_ref: &[f32; 4] = self.as_ref();
+        Simdf32x4::load(self_ref.as_ref(), 0 as usize)
+    }
+}
+
 impl<S: BaseFloat> Quaternion<S> {
     /// Construct a new quaternion from one scalar component and three
     /// imaginary components
@@ -73,7 +99,7 @@ impl<S: BaseFloat> Quaternion<S> {
         let mag_avg = (src.magnitude2() * dst.magnitude2()).sqrt();
         let dot = src.dot(dst);
         if ulps_eq!(dot, &mag_avg) {
-            Quaternion::one()
+            Quaternion::<S>::one()
         } else if ulps_eq!(dot, &-mag_avg) {
             let axis = fallback.unwrap_or_else(|| {
                 let mut v = Vector3::unit_x().cross(src);
@@ -151,7 +177,7 @@ impl<S: BaseFloat> Zero for Quaternion<S> {
 
     #[inline]
     fn is_zero(&self) -> bool {
-        ulps_eq!(self, &Quaternion::zero())
+        ulps_eq!(self, &Quaternion::<S>::zero())
     }
 }
 
@@ -175,6 +201,7 @@ impl<S: BaseFloat> MetricSpace for Quaternion<S> {
     }
 }
 
+#[cfg(not(feature = "use_simd"))]
 impl<S: BaseFloat> InnerSpace for Quaternion<S> {
     #[inline]
     fn dot(self, other: Quaternion<S>) -> S {
@@ -182,6 +209,25 @@ impl<S: BaseFloat> InnerSpace for Quaternion<S> {
     }
 }
 
+#[cfg(feature = "use_simd")]
+impl<S: BaseFloat> InnerSpace for Quaternion<S> {
+    #[inline]
+    default fn dot(self, other: Quaternion<S>) -> S {
+        self.s * other.s + self.v.dot(other.v)
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl InnerSpace for Quaternion<f32> {
+    #[inline]
+    fn dot(self, other: Quaternion<f32>) -> f32 {
+        let lhs: Simdf32x4 = self.into();
+        let rhs: Simdf32x4 = other.into();
+        let r = lhs * rhs;
+        r.extract(0) + r.extract(1) + r.extract(2) + r.extract(3)
+    }
+}
+
 impl<A> From<Euler<A>> for Quaternion<<A as Angle>::Unitless> where
     A: Angle + Into<Rad<<A as Angle>::Unitless>>,
 {
@@ -203,35 +249,119 @@ impl<A> From<Euler<A>> for Quaternion<<A as Angle>::Unitless> where
     }
 }
 
+#[cfg(not(feature = "use_simd"))]
 impl_operator!(<S: BaseFloat> Neg for Quaternion<S> {
     fn neg(quat) -> Quaternion<S> {
         Quaternion::from_sv(-quat.s, -quat.v)
     }
 });
 
+#[cfg(feature = "use_simd")]
+impl_operator_default!(<S: BaseFloat> Neg for Quaternion<S> {
+    fn neg(quat) -> Quaternion<S> {
+        Quaternion::from_sv(-quat.s, -quat.v)
+    }
+});
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{
+    [Simdf32x4]; Neg for Quaternion<f32> {
+        fn neg(lhs) -> Quaternion<f32> {
+            (-lhs).into()
+        }
+    }
+}
+
+#[cfg(not(feature = "use_simd"))]
 impl_operator!(<S: BaseFloat> Mul<S> for Quaternion<S> {
     fn mul(lhs, rhs) -> Quaternion<S> {
         Quaternion::from_sv(lhs.s * rhs, lhs.v * rhs)
     }
 });
+
+#[cfg(feature = "use_simd")]
+impl_operator_default!(<S: BaseFloat> Mul<S> for Quaternion<S> {
+    fn mul(lhs, rhs) -> Quaternion<S> {
+        Quaternion::from_sv(lhs.s * rhs, lhs.v * rhs)
+    }
+});
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{@rs
+    [Simdf32x4]; Mul<f32> for Quaternion<f32> {
+        fn mul(lhs, rhs) -> Quaternion<f32> {
+            (lhs * rhs).into()
+        }
+    }
+}
+
+#[cfg(not(feature = "use_simd"))]
 impl_assignment_operator!(<S: BaseFloat> MulAssign<S> for Quaternion<S> {
     fn mul_assign(&mut self, scalar) { self.s *= scalar; self.v *= scalar; }
 });
 
+#[cfg(feature = "use_simd")]
+impl_assignment_operator_default!(<S: BaseFloat> MulAssign<S> for Quaternion<S> {
+    fn mul_assign(&mut self, scalar) { self.s *= scalar; self.v *= scalar; }
+});
+
+#[cfg(feature = "use_simd")]
+impl MulAssign<f32> for Quaternion<f32> {
+    fn mul_assign(&mut self, other: f32) {
+        let s: Simdf32x4 = (*self).into();
+        let other = Simdf32x4::splat(other);
+        *self = (s * other).into();
+    }
+}
+
+#[cfg(not(feature = "use_simd"))]
 impl_operator!(<S: BaseFloat> Div<S> for Quaternion<S> {
     fn div(lhs, rhs) -> Quaternion<S> {
         Quaternion::from_sv(lhs.s / rhs, lhs.v / rhs)
     }
 });
+
+#[cfg(feature = "use_simd")]
+impl_operator_default!(<S: BaseFloat> Div<S> for Quaternion<S> {
+    fn div(lhs, rhs) -> Quaternion<S> {
+        Quaternion::from_sv(lhs.s / rhs, lhs.v / rhs)
+    }
+});
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{@rs
+    [Simdf32x4]; Div<f32> for Quaternion<f32> {
+        fn div(lhs, rhs) -> Quaternion<f32> {
+            (lhs / rhs).into()
+        }
+    }
+}
+
+#[cfg(not(feature = "use_simd"))]
 impl_assignment_operator!(<S: BaseFloat> DivAssign<S> for Quaternion<S> {
     fn div_assign(&mut self, scalar) { self.s /= scalar; self.v /= scalar; }
 });
 
+#[cfg(feature = "use_simd")]
+impl_assignment_operator_default!(<S: BaseFloat> DivAssign<S> for Quaternion<S> {
+    fn div_assign(&mut self, scalar) { self.s /= scalar; self.v /= scalar; }
+});
+
+#[cfg(feature = "use_simd")]
+impl DivAssign<f32> for Quaternion<f32> {
+    fn div_assign(&mut self, other: f32) {
+        let s: Simdf32x4 = (*self).into();
+        let other = Simdf32x4::splat(other);
+        *self = (s / other).into();
+    }
+}
+
 impl_operator!(<S: BaseFloat> Rem<S> for Quaternion<S> {
     fn rem(lhs, rhs) -> Quaternion<S> {
         Quaternion::from_sv(lhs.s % rhs, lhs.v % rhs)
     }
 });
+
 impl_assignment_operator!(<S: BaseFloat> RemAssign<S> for Quaternion<S> {
     fn rem_assign(&mut self, scalar) { self.s %= scalar; self.v %= scalar; }
 });
@@ -245,24 +375,93 @@ impl_operator!(<S: BaseFloat> Mul<Vector3<S> > for Quaternion<S> {
     }}
 });
 
+#[cfg(not(feature = "use_simd"))]
 impl_operator!(<S: BaseFloat> Add<Quaternion<S> > for Quaternion<S> {
     fn add(lhs, rhs) -> Quaternion<S> {
         Quaternion::from_sv(lhs.s + rhs.s, lhs.v + rhs.v)
     }
 });
+
+#[cfg(feature = "use_simd")]
+impl_operator_default!(<S: BaseFloat> Add<Quaternion<S> > for Quaternion<S> {
+    fn add(lhs, rhs) -> Quaternion<S> {
+        Quaternion::from_sv(lhs.s + rhs.s, lhs.v + rhs.v)
+    }
+});
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{
+    [Simdf32x4]; Add<Quaternion<f32>> for Quaternion<f32> {
+        fn add(lhs, rhs) -> Quaternion<f32> {
+            (lhs + rhs).into()
+        }
+    }
+}
+
+#[cfg(not(feature = "use_simd"))]
 impl_assignment_operator!(<S: BaseFloat> AddAssign<Quaternion<S> > for Quaternion<S> {
     fn add_assign(&mut self, other) { self.s += other.s; self.v += other.v; }
 });
 
+#[cfg(feature = "use_simd")]
+impl_assignment_operator_default!(<S: BaseFloat> AddAssign<Quaternion<S> > for Quaternion<S> {
+    fn add_assign(&mut self, other) { self.s += other.s; self.v += other.v; }
+});
+
+#[cfg(feature = "use_simd")]
+impl AddAssign for Quaternion<f32> {
+    #[inline]
+    fn add_assign(&mut self, rhs: Self) {
+        let s: Simdf32x4 = (*self).into();
+        let rhs: Simdf32x4 = rhs.into();
+        *self = (s + rhs).into();
+    }
+}
+
+#[cfg(not(feature = "use_simd"))]
 impl_operator!(<S: BaseFloat> Sub<Quaternion<S> > for Quaternion<S> {
     fn sub(lhs, rhs) -> Quaternion<S> {
         Quaternion::from_sv(lhs.s - rhs.s, lhs.v - rhs.v)
     }
 });
+
+#[cfg(feature = "use_simd")]
+impl_operator_default!(<S: BaseFloat> Sub<Quaternion<S> > for Quaternion<S> {
+    fn sub(lhs, rhs) -> Quaternion<S> {
+        Quaternion::from_sv(lhs.s - rhs.s, lhs.v - rhs.v)
+    }
+});
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{
+    [Simdf32x4]; Sub<Quaternion<f32>> for Quaternion<f32> {
+        fn sub(lhs, rhs) -> Quaternion<f32> {
+            (lhs - rhs).into()
+        }
+    }
+}
+
+#[cfg(not(feature = "use_simd"))]
 impl_assignment_operator!(<S: BaseFloat> SubAssign<Quaternion<S> > for Quaternion<S> {
     fn sub_assign(&mut self, other) { self.s -= other.s; self.v -= other.v; }
 });
 
+#[cfg(feature = "use_simd")]
+impl_assignment_operator_default!(<S: BaseFloat> SubAssign<Quaternion<S> > for Quaternion<S> {
+    fn sub_assign(&mut self, other) { self.s -= other.s; self.v -= other.v; }
+});
+
+#[cfg(feature = "use_simd")]
+impl SubAssign for Quaternion<f32> {
+    #[inline]
+    fn sub_assign(&mut self, rhs: Self) {
+        let s: Simdf32x4 = (*self).into();
+        let rhs: Simdf32x4 = rhs.into();
+        *self = (s - rhs).into();
+    }
+}
+
+#[cfg(not(feature = "use_simd"))]
 impl_operator!(<S: BaseFloat> Mul<Quaternion<S> > for Quaternion<S> {
     fn mul(lhs, rhs) -> Quaternion<S> {
         Quaternion::new(lhs.s * rhs.s - lhs.v.x * rhs.v.x - lhs.v.y * rhs.v.y - lhs.v.z * rhs.v.z,
@@ -272,6 +471,37 @@ impl_operator!(<S: BaseFloat> Mul<Quaternion<S> > for Quaternion<S> {
     }
 });
 
+#[cfg(feature = "use_simd")]
+impl_operator_default!(<S: BaseFloat> Mul<Quaternion<S> > for Quaternion<S> {
+    fn mul(lhs, rhs) -> Quaternion<S> {
+        Quaternion::new(lhs.s * rhs.s - lhs.v.x * rhs.v.x - lhs.v.y * rhs.v.y - lhs.v.z * rhs.v.z,
+                        lhs.s * rhs.v.x + lhs.v.x * rhs.s + lhs.v.y * rhs.v.z - lhs.v.z * rhs.v.y,
+                        lhs.s * rhs.v.y + lhs.v.y * rhs.s + lhs.v.z * rhs.v.x - lhs.v.x * rhs.v.z,
+                        lhs.s * rhs.v.z + lhs.v.z * rhs.s + lhs.v.x * rhs.v.y - lhs.v.y * rhs.v.x)
+    }
+});
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{
+    [Simdf32x4]; Mul<Quaternion<f32>> for Quaternion<f32> {
+        fn mul(lhs, rhs) -> Quaternion<f32> {
+            {
+                let p0 = Simdf32x4::splat(lhs.extract(0)) * rhs;
+                let p1 = Simdf32x4::splat(lhs.extract(1)) * Simdf32x4::new(
+                    -rhs.extract(1), rhs.extract(0), -rhs.extract(3), rhs.extract(2)
+                );
+                let p2 = Simdf32x4::splat(lhs.extract(2)) * Simdf32x4::new(
+                    -rhs.extract(2), rhs.extract(3), rhs.extract(0), -rhs.extract(1)
+                );
+                let p3 = Simdf32x4::splat(lhs.extract(3)) * Simdf32x4::new(
+                    -rhs.extract(3), -rhs.extract(2), rhs.extract(1), rhs.extract(0)
+                );
+                (p0 + p1 + p2 + p3).into()
+            }
+        }
+    }
+}
+
 macro_rules! impl_scalar_mul {
     ($S:ident) => {
         impl_operator!(Mul<Quaternion<$S>> for $S {
diff --git a/src/vector.rs b/src/vector.rs
index f3828a6..a289614 100644
--- a/src/vector.rs
+++ b/src/vector.rs
@@ -802,6 +802,14 @@ impl Into<Simdf32x4> for Vector4<f32> {
     }
 }
 
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{
+    [Simdf32x4]; Add<Vector4<f32>> for Vector4<f32> {
+        fn add(lhs, rhs) -> Vector4<f32> {
+            (lhs + rhs).into()
+        }
+    }
+}
 
 #[cfg(feature = "use_simd")]
 impl_operator_simd!{
@@ -1050,7 +1058,6 @@ impl Into<Simdu32x4> for Vector4<u32> {
     }
 }
 
-
 #[cfg(feature = "use_simd")]
 impl_operator_simd!{
     [Simdu32x4]; Add<Vector4<u32>> for Vector4<u32> {
@@ -1060,15 +1067,6 @@ impl_operator_simd!{
     }
 }
 
-#[cfg(feature = "use_simd")]
-impl_operator_simd!{
-    [Simdf32x4]; Add<Vector4<f32>> for Vector4<f32> {
-        fn add(lhs, rhs) -> Vector4<f32> {
-            (lhs + rhs).into()
-        }
-    }
-}
-
 #[cfg(feature = "use_simd")]
 impl_operator_simd!{
     [Simdu32x4]; Sub<Vector4<u32>> for Vector4<u32> {