From 64924b954d94338f22d19f3ea1eef701aad64549 Mon Sep 17 00:00:00 2001
From: Luxko <liarluxlux@gmail.com>
Date: Sat, 25 Feb 2017 07:26:11 +0800
Subject: [PATCH] [WIP] Add basic SIMD support

- Add opt-in SIMD support for the module (Cargo feature `use_simd`). The
feature requires the `simd` crate and specialization, so it can only be
enabled on nightly. In the given benchmark, certain operations were up to
60% faster. Currently the supported types and operations are highly limited.
- Clean up some dead tests. Also add new tests for SIMD.
---
 Cargo.toml          |   2 +
 src/lib.rs          |   5 +-
 src/macros.rs       | 257 ++++++++++++++++++
 src/matrix.rs       | 203 ++++++++++++--
 src/vector.rs       | 628 +++++++++++++++++++++++++++++++++++++++++++-
 tests/quaternion.rs |   4 +-
 tests/vectorf32.rs  | 267 +++++++++++++++++++
 7 files changed, 1335 insertions(+), 31 deletions(-)
 create mode 100644 tests/vectorf32.rs

diff --git a/Cargo.toml b/Cargo.toml
index 12bd240..f470093 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -30,6 +30,7 @@ name = "cgmath"
 unstable = []
 default = ["rustc-serialize"]
 eders = ["serde", "serde_macros"]
+use_simd = ["simd"]
 
 [dependencies]
 approx = "0.1"
@@ -38,6 +39,7 @@ rand = "0.3"
 rustc-serialize = { version = "0.3", optional = true }
 serde = { version = "0.8", optional = true }
 serde_macros = { version = "0.8", optional = true }
+simd = { version = "0.2", optional = true }
 
 [dev-dependencies]
 glium = "0.15"
diff --git a/src/lib.rs b/src/lib.rs
index 822857b..8941603 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -49,9 +49,9 @@
 //! ```rust
 //! use cgmath::prelude::*;
 //! ```
-
 #![cfg_attr(feature = "eders", feature(plugin, custom_derive))]
 #![cfg_attr(feature = "eders", plugin(serde_macros))]
+#![cfg_attr(feature = "use_simd", feature(specialization))]
 
 #[macro_use]
 extern crate approx;
@@ -64,6 +64,9 @@ extern crate rustc_serialize;
 #[cfg(feature = "eders")]
 extern crate serde;
 
+#[cfg(feature = "use_simd")]
+extern crate simd;
+
 // Re-exports
 
 pub use approx::*;
diff --git a/src/macros.rs b/src/macros.rs
index d9e54de..ac741e8 100644
--- a/src/macros.rs
+++ b/src/macros.rs
@@ -254,3 +254,260 @@ macro_rules! impl_index_operators {
         }
     }
 }
+
+#[cfg(feature = "use_simd")]
+macro_rules! impl_operator_default {
+    // When it is a unary operator
+    (<$S:ident: $Constraint:ident> $Op:ident for $Lhs:ty {
+        fn $op:ident($x:ident) -> $Output:ty { $body:expr }
+    }) => {
+        impl<$S: $Constraint> $Op for $Lhs {
+           type Output = $Output;
+            #[inline]
+            default fn $op(self) -> $Output {
+                let $x = self; $body
+            }
+        }
+
+        impl<'a, $S: $Constraint> $Op for &'a $Lhs {
+           type Output = $Output;
+            #[inline]
+            default fn $op(self) -> $Output {
+                let $x = self; $body
+            }
+        }
+    };
+    // When the right operand is a scalar
+    (<$S:ident: $Constraint:ident> $Op:ident<$Rhs:ident> for $Lhs:ty {
+        fn $op:ident($lhs:ident, $rhs:ident) -> $Output:ty { $body:expr }
+    }) => {
+        impl<$S: $Constraint> $Op<$Rhs> for $Lhs {
+           type Output = $Output;
+            #[inline]
+            default fn $op(self, other: $Rhs) -> $Output {
+                let ($lhs, $rhs) = (self, other); $body
+            }
+        }
+
+        impl<'a, $S: $Constraint> $Op<$Rhs> for &'a $Lhs {
+          type Output = $Output;
+            #[inline]
+            default fn $op(self, other: $Rhs) -> $Output {
+                let ($lhs, $rhs) = (self, other); $body
+            }
+        }
+    };
+    // When the right operand is a compound type
+    (<$S:ident: $Constraint:ident> $Op:ident<$Rhs:ty> for $Lhs:ty {
+        fn $op:ident($lhs:ident, $rhs:ident) -> $Output:ty { $body:expr }
+    }) => {
+        impl<$S: $Constraint> $Op<$Rhs> for $Lhs {
+           type Output = $Output;
+            #[inline]
+            default fn $op(self, other: $Rhs) -> $Output {
+                let ($lhs, $rhs) = (self, other); $body
+            }
+        }
+
+        impl<'a, $S: $Constraint> $Op<&'a $Rhs> for $Lhs {
+           type Output = $Output;
+            #[inline]
+            default fn $op(self, other: &'a $Rhs) -> $Output {
+                let ($lhs, $rhs) = (self, other); $body
+            }
+        }
+
+        impl<'a, $S: $Constraint> $Op<$Rhs> for &'a $Lhs {
+           type Output = $Output;
+            #[inline]
+            default fn $op(self, other: $Rhs) -> $Output {
+                let ($lhs, $rhs) = (self, other); $body
+            }
+        }
+
+        impl<'a, 'b, $S: $Constraint> $Op<&'a $Rhs> for &'b $Lhs {
+           type Output = $Output;
+            #[inline]
+            default fn $op(self, other: &'a $Rhs) -> $Output {
+                let ($lhs, $rhs) = (self, other); $body
+            }
+        }
+    };
+    // When the left operand is a scalar
+    ($Op:ident<$Rhs:ident<$S:ident>> for $Lhs:ty {
+        fn $op:ident($lhs:ident, $rhs:ident) -> $Output:ty { $body:expr }
+    }) => {
+        impl $Op<$Rhs<$S>> for $Lhs {
+           type Output = $Output;
+            #[inline]
+            default fn $op(self, other: $Rhs<$S>) -> $Output {
+                let ($lhs, $rhs) = (self, other); $body
+            }
+        }
+
+        impl<'a> $Op<&'a $Rhs<$S>> for $Lhs {
+           type Output = $Output;
+            #[inline]
+            default fn $op(self, other: &'a $Rhs<$S>) -> $Output {
+                let ($lhs, $rhs) = (self, other); $body
+            }
+        }
+    };
+}
+
+#[cfg(feature = "use_simd")]
+macro_rules! impl_assignment_operator_default {
+    (<$S:ident: $Constraint:ident> $Op:ident<$Rhs:ty> for $Lhs:ty {
+        fn $op:ident(&mut $lhs:ident, $rhs:ident) $body:block
+    }) => {
+        impl<$S: $Constraint + $Op<$S>> $Op<$Rhs> for $Lhs {
+            #[inline]
+            default fn $op(&mut $lhs, $rhs: $Rhs) $body
+        }
+    };
+}
+
+/// Generates SIMD-backed operator implementations: by-val for unary operators, and the by-ref/by-val permutations for binary operators
+#[cfg(feature = "use_simd")]
+macro_rules! impl_operator_simd {
+    // When it is a unary operator
+    ([$Simd:ident]; $Op:ident for $Lhs:ty {
+        fn $op:ident($x:ident) -> $Output:ty { $body:expr }
+    }) => {
+ 
+        impl $Op for $Lhs {
+            #[inline]
+            fn $op(self) -> $Output {
+                let $x: $Simd = self.into(); $body
+            }
+        }
+
+        // #[cfg(feature = "simd")]
+        // impl<'a> $Op for &'a $Lhs {
+        //     type Output = $Output;
+        //     #[inline]
+        //     fn $op(self) -> $Output {
+        //         let $x: $Simd = (*self).into(); $body
+        //     }
+        // }
+    };
+    // When the right operand is a scalar
+    (@rs [$Simd:ident]; $Op:ident<$Rhs:ty> for $Lhs:ty {
+        fn $op:ident($lhs:ident, $rhs:ident) -> $Output:ty { $body:expr }
+    }) => {
+ 
+        impl $Op<$Rhs> for $Lhs {
+            #[inline]
+            fn $op(self, other: $Rhs) -> $Output {
+                let ($lhs, $rhs): ($Simd, $Simd) = (self.into(), $Simd::splat(other)); $body
+            }
+        }
+
+ 
+        impl<'a> $Op<$Rhs> for &'a $Lhs {
+            #[inline]
+            fn $op(self, other: $Rhs) -> $Output {
+                let ($lhs, $rhs): ($Simd, $Simd) = ((*self).into(), $Simd::splat(other)); $body
+            }
+        }
+    };
+    // When the right operand is a compound type
+    ([$Simd:ident]; $Op:ident<$Rhs:ty> for $Lhs:ty {
+        fn $op:ident($lhs:ident, $rhs:ident) -> $Output:ty { $body:expr }
+    }) => {
+ 
+        impl $Op<$Rhs> for $Lhs {
+            #[inline]
+            fn $op(self, other: $Rhs) -> $Output {
+                let ($lhs, $rhs): ($Simd, $Simd) = (self.into(), other.into()); $body
+            }
+        }
+
+ 
+        impl<'a> $Op<&'a $Rhs> for $Lhs {
+            #[inline]
+            fn $op(self, other: &'a $Rhs) -> $Output {
+                let ($lhs, $rhs): ($Simd, $Simd) = (self.into(), (*other).into()); $body
+            }
+        }
+
+ 
+        impl<'a> $Op<$Rhs> for &'a $Lhs {
+            #[inline]
+            fn $op(self, other: $Rhs) -> $Output {
+                let ($lhs, $rhs): ($Simd, $Simd) = ((*self).into(), other.into()); $body
+            }
+        }
+
+ 
+        impl<'a, 'b> $Op<&'a $Rhs> for &'b $Lhs {
+            #[inline]
+            fn $op(self, other: &'a $Rhs) -> $Output {
+                let ($lhs, $rhs): ($Simd, $Simd) = ((*self).into(), (*other).into()); $body
+            }
+        }
+    };
+    // When the left operand is a scalar
+    (@ls [$Simd:ident]; $Op:ident<$Rhs:ty> for $Lhs:ident {
+        fn $op:ident($lhs:ident, $rhs:ident) -> $Output:ty { $body:expr }
+    }) => {
+ 
+        impl $Op<$Rhs> for $Lhs {
+            #[inline]
+            fn $op(self, other: $Rhs) -> $Output {
+                let ($lhs, $rhs): ($Simd, $Simd) = ($Simd::splat(self), other.into()); $body
+            }
+        }
+
+ 
+        impl<'a> $Op<&'a $Rhs> for $Lhs {
+            #[inline]
+            fn $op(self, other: &'a $Rhs) -> $Output {
+                let ($lhs, $rhs): ($Simd, $Simd) = ($Simd::splat(self), (*other).into()); $body
+            }
+        }
+    };
+
+    // // When left is row-vec, right is column-matrix
+    // (@vm [$Simd: ident]; $Op:ident<$Rhs:ty> for $Lhs:ty {
+    //     fn $op:ident($lhs:ident, $rhs:ident) -> $Output:ty {
+
+    //     }
+    // })
+
+    // When both operands are matrices (operands are passed through without SIMD conversion)
+    (@mm $Op:ident<$Rhs:ty> for $Lhs:ty {
+        fn $op:ident($lhs:ident, $rhs:ident) -> $Output:ty { $body: expr }
+    }) => {
+        impl $Op<$Rhs> for $Lhs {
+            #[inline]
+            fn $op(self, other: $Rhs) -> $Output {
+                let ($lhs, $rhs) = (self, other); $body
+            }
+        }
+
+ 
+        impl<'a> $Op<&'a $Rhs> for $Lhs {
+            #[inline]
+            fn $op(self, other: &'a $Rhs) -> $Output {
+                let ($lhs, $rhs) = (self, other); $body
+            }
+        }
+
+ 
+        impl<'a> $Op<$Rhs> for &'a $Lhs {
+            #[inline]
+            fn $op(self, other: $Rhs) -> $Output {
+                let ($lhs, $rhs) = (self, other); $body
+            }
+        }
+
+ 
+        impl<'a, 'b> $Op<&'a $Rhs> for &'b $Lhs {
+            #[inline]
+            fn $op(self, other: &'a $Rhs) -> $Output {
+                let ($lhs, $rhs) = (self, other); $body
+            }
+        }
+    }
+}
diff --git a/src/matrix.rs b/src/matrix.rs
index 6a1e2fd..ccd3b81 100644
--- a/src/matrix.rs
+++ b/src/matrix.rs
@@ -615,6 +615,7 @@ impl<S: BaseFloat> Matrix for Matrix4<S> {
     }
 }
 
+//#[cfg(not(feature = "use_simd"))]
 impl<S: BaseFloat> SquareMatrix for Matrix4<S> {
     type ColumnRow = Vector4<S>;
 
@@ -672,7 +673,7 @@ impl<S: BaseFloat> SquareMatrix for Matrix4<S> {
     }
 
     fn invert(&self) -> Option<Matrix4<S>> {
-        let det = self.determinant();
+        let det: S = self.determinant();
         if ulps_eq!(det, &S::zero()) { None } else {
             let inv_det = S::one() / det;
             let t = self.transpose();
@@ -731,6 +732,123 @@ impl<S: BaseFloat> SquareMatrix for Matrix4<S> {
         ulps_eq!(self[3][2], &self[2][3])
     }
 }
+// #[cfg(feature = "use_simd")]
+// impl<S: BaseFloat> SquareMatrix for Matrix4<S> {
+//     type ColumnRow = Vector4<S>;
+
+//     #[inline]
+//     default fn from_value(value: S) -> Matrix4<S> {
+//         Matrix4::new(value, S::zero(), S::zero(), S::zero(),
+//                      S::zero(), value, S::zero(), S::zero(),
+//                      S::zero(), S::zero(), value, S::zero(),
+//                      S::zero(), S::zero(), S::zero(), value)
+//     }
+
+//     #[inline]
+//     default fn from_diagonal(value: Vector4<S>) -> Matrix4<S> {
+//         Matrix4::new(value.x, S::zero(), S::zero(), S::zero(),
+//                      S::zero(), value.y, S::zero(), S::zero(),
+//                      S::zero(), S::zero(), value.z, S::zero(),
+//                      S::zero(), S::zero(), S::zero(), value.w)
+//     }
+
+//     default fn transpose_self(&mut self) {
+//         self.swap_elements((0, 1), (1, 0));
+//         self.swap_elements((0, 2), (2, 0));
+//         self.swap_elements((0, 3), (3, 0));
+//         self.swap_elements((1, 2), (2, 1));
+//         self.swap_elements((1, 3), (3, 1));
+//         self.swap_elements((2, 3), (3, 2));
+//     }
+
+//     default fn determinant(&self) -> S {
+//         let m0 = Matrix3::new(self[1][1], self[2][1], self[3][1],
+//                               self[1][2], self[2][2], self[3][2],
+//                               self[1][3], self[2][3], self[3][3]);
+//         let m1 = Matrix3::new(self[0][1], self[2][1], self[3][1],
+//                               self[0][2], self[2][2], self[3][2],
+//                               self[0][3], self[2][3], self[3][3]);
+//         let m2 = Matrix3::new(self[0][1], self[1][1], self[3][1],
+//                               self[0][2], self[1][2], self[3][2],
+//                               self[0][3], self[1][3], self[3][3]);
+//         let m3 = Matrix3::new(self[0][1], self[1][1], self[2][1],
+//                               self[0][2], self[1][2], self[2][2],
+//                               self[0][3], self[1][3], self[2][3]);
+
+//         self[0][0] * m0.determinant() -
+//         self[1][0] * m1.determinant() +
+//         self[2][0] * m2.determinant() -
+//         self[3][0] * m3.determinant()
+//     }
+
+//     #[inline]
+//     default fn diagonal(&self) -> Vector4<S> {
+//         Vector4::new(self[0][0],
+//                      self[1][1],
+//                      self[2][2],
+//                      self[3][3])
+//     }
+
+//     default fn invert(&self) -> Option<Matrix4<S>> {
+//         let det = self.determinant();
+//         if ulps_eq!(det, &S::zero()) { None } else {
+//             let inv_det = S::one() / det;
+//             let t = self.transpose();
+//             let cf = |i, j| {
+//                 let mat = match i {
+//                     0 => Matrix3::from_cols(t.y.truncate_n(j), t.z.truncate_n(j), t.w.truncate_n(j)),
+//                     1 => Matrix3::from_cols(t.x.truncate_n(j), t.z.truncate_n(j), t.w.truncate_n(j)),
+//                     2 => Matrix3::from_cols(t.x.truncate_n(j), t.y.truncate_n(j), t.w.truncate_n(j)),
+//                     3 => Matrix3::from_cols(t.x.truncate_n(j), t.y.truncate_n(j), t.z.truncate_n(j)),
+//                     _ => panic!("out of range"),
+//                 };
+//                 let sign = if (i + j) & 1 == 1 { -S::one() } else { S::one() };
+//                 mat.determinant() * sign * inv_det
+//             };
+
+//             Some(Matrix4::new(cf(0, 0), cf(0, 1), cf(0, 2), cf(0, 3),
+//                               cf(1, 0), cf(1, 1), cf(1, 2), cf(1, 3),
+//                               cf(2, 0), cf(2, 1), cf(2, 2), cf(2, 3),
+//                               cf(3, 0), cf(3, 1), cf(3, 2), cf(3, 3)))
+//         }
+//     }
+
+//     default fn is_diagonal(&self) -> bool {
+//         ulps_eq!(self[0][1], &S::zero()) &&
+//         ulps_eq!(self[0][2], &S::zero()) &&
+//         ulps_eq!(self[0][3], &S::zero()) &&
+
+//         ulps_eq!(self[1][0], &S::zero()) &&
+//         ulps_eq!(self[1][2], &S::zero()) &&
+//         ulps_eq!(self[1][3], &S::zero()) &&
+
+//         ulps_eq!(self[2][0], &S::zero()) &&
+//         ulps_eq!(self[2][1], &S::zero()) &&
+//         ulps_eq!(self[2][3], &S::zero()) &&
+
+//         ulps_eq!(self[3][0], &S::zero()) &&
+//         ulps_eq!(self[3][1], &S::zero()) &&
+//         ulps_eq!(self[3][2], &S::zero())
+//     }
+
+//     default fn is_symmetric(&self) -> bool {
+//         ulps_eq!(self[0][1], &self[1][0]) &&
+//         ulps_eq!(self[0][2], &self[2][0]) &&
+//         ulps_eq!(self[0][3], &self[3][0]) &&
+
+//         ulps_eq!(self[1][0], &self[0][1]) &&
+//         ulps_eq!(self[1][2], &self[2][1]) &&
+//         ulps_eq!(self[1][3], &self[3][1]) &&
+
+//         ulps_eq!(self[2][0], &self[0][2]) &&
+//         ulps_eq!(self[2][1], &self[1][2]) &&
+//         ulps_eq!(self[2][3], &self[3][2]) &&
+
+//         ulps_eq!(self[3][0], &self[0][3]) &&
+//         ulps_eq!(self[3][1], &self[1][3]) &&
+//         ulps_eq!(self[3][2], &self[2][3])
+//     }
+// }
 
 impl<S: BaseFloat> ApproxEq for Matrix2<S> {
     type Epsilon = S::Epsilon;
@@ -955,10 +1073,6 @@ macro_rules! impl_matrix {
             fn sub_assign(&mut self, other: $MatrixN<S>) { $(self.$field -= other.$field);+ }
         }
 
-        impl_operator!(<S: BaseFloat> Mul<$VectorN<S> > for $MatrixN<S> {
-            fn mul(matrix, vector) -> $VectorN<S> { $VectorN::new($(matrix.row($row_index).dot(vector.clone())),+) }
-        });
-
         impl_scalar_ops!($MatrixN<usize> { $($field),+ });
         impl_scalar_ops!($MatrixN<u8> { $($field),+ });
         impl_scalar_ops!($MatrixN<u16> { $($field),+ });
@@ -1001,6 +1115,25 @@ impl_matrix!(Matrix2, Vector2 { x: 0, y: 1 });
 impl_matrix!(Matrix3, Vector3 { x: 0, y: 1, z: 2 });
 impl_matrix!(Matrix4, Vector4 { x: 0, y: 1, z: 2, w: 3 });
 
+macro_rules! impl_mv_operator {
+    ($MatrixN:ident, $VectorN:ident { $($field:ident : $row_index:expr),+ }) => {
+        impl_operator!(<S: BaseFloat> Mul<$VectorN<S> > for $MatrixN<S> {
+            fn mul(matrix, vector) -> $VectorN<S> {$VectorN::new($(matrix.row($row_index).dot(vector.clone())),+)}
+        });
+    }
+}
+
+impl_mv_operator!(Matrix2, Vector2 { x: 0, y: 1 });
+impl_mv_operator!(Matrix3, Vector3 { x: 0, y: 1, z: 2 });
+#[cfg(not(feature = "use_simd"))]
+impl_mv_operator!(Matrix4, Vector4 { x: 0, y: 1, z: 2, w: 3 });
+#[cfg(feature = "use_simd")]
+impl_operator!(<S: BaseFloat> Mul<Vector4<S> > for Matrix4<S> {
+    fn mul(matrix, vector) -> Vector4<S> {
+        matrix[0] * vector[0] + matrix[1] * vector[1] + matrix[2] * vector[2] + matrix[3] * vector[3]
+    }
+});
+
 impl_operator!(<S: BaseFloat> Mul<Matrix2<S> > for Matrix2<S> {
     fn mul(lhs, rhs) -> Matrix2<S> {
         Matrix2::new(lhs.row(0).dot(rhs[0]), lhs.row(1).dot(rhs[0]),
@@ -1020,21 +1153,21 @@ impl_operator!(<S: BaseFloat> Mul<Matrix3<S> > for Matrix3<S> {
 // causes the LLVM to miss identical loads and multiplies. This optimization
 // causes the code to be auto vectorized properly increasing the performance
 // around ~4 times.
-macro_rules! dot_matrix4 {
-    ($A:expr, $B:expr, $I:expr, $J:expr) => {
-        ($A[0][$I]) * ($B[$J][0]) +
-        ($A[1][$I]) * ($B[$J][1]) +
-        ($A[2][$I]) * ($B[$J][2]) +
-        ($A[3][$I]) * ($B[$J][3])
-    };
-}
 
 impl_operator!(<S: BaseFloat> Mul<Matrix4<S> > for Matrix4<S> {
     fn mul(lhs, rhs) -> Matrix4<S> {
-        Matrix4::new(dot_matrix4!(lhs, rhs, 0, 0), dot_matrix4!(lhs, rhs, 1, 0), dot_matrix4!(lhs, rhs, 2, 0), dot_matrix4!(lhs, rhs, 3, 0),
-                     dot_matrix4!(lhs, rhs, 0, 1), dot_matrix4!(lhs, rhs, 1, 1), dot_matrix4!(lhs, rhs, 2, 1), dot_matrix4!(lhs, rhs, 3, 1),
-                     dot_matrix4!(lhs, rhs, 0, 2), dot_matrix4!(lhs, rhs, 1, 2), dot_matrix4!(lhs, rhs, 2, 2), dot_matrix4!(lhs, rhs, 3, 2),
-                     dot_matrix4!(lhs, rhs, 0, 3), dot_matrix4!(lhs, rhs, 1, 3), dot_matrix4!(lhs, rhs, 2, 3), dot_matrix4!(lhs, rhs, 3, 3))
+        {
+            let a = lhs[0];
+            let b = lhs[1];
+            let c = lhs[2];
+            let d = lhs[3];
+            Matrix4::from_cols(
+                a*rhs[0][0] + b*rhs[0][1] + c*rhs[0][2] + d*rhs[0][3],
+                a*rhs[1][0] + b*rhs[1][1] + c*rhs[1][2] + d*rhs[1][3],
+                a*rhs[2][0] + b*rhs[2][1] + c*rhs[2][2] + d*rhs[2][3],
+                a*rhs[3][0] + b*rhs[3][1] + c*rhs[3][2] + d*rhs[3][3],
+            )
+        }
     }
 });
 
@@ -1318,3 +1451,39 @@ impl<S: BaseFloat + Rand> Rand for Matrix4<S> {
         Matrix4{ x: rng.gen(), y: rng.gen(), z: rng.gen(), w: rng.gen() }
     }
 }
+
+// The SIMD determinant below is sadly buggy, so it is left commented out.
+// #[cfg(feature = "use_simd")]
+// impl SquareMatrix for Matrix4<f32> {
+//     fn determinant(&self) -> f32 {
+//         let a = Simdf32x4::new(self.z[1], self.x[1], self.w[1], self.y[1]);
+//         let b = Simdf32x4::new(self.y[2], self.y[2], self.z[2], self.z[2]);
+//         let c = Simdf32x4::new(self.x[3], self.z[3], self.x[3], self.z[3]);
+//         let mut tmp = a * (b * c);
+//         let d = Simdf32x4::new(self.y[1], self.y[1], self.z[1], self.z[1]);
+//         let e = Simdf32x4::new(self.x[2], self.z[2], self.x[2], self.z[2]);
+//         let f = Simdf32x4::new(self.z[3], self.x[3], self.w[3], self.y[3]);
+//         let tmp1 = d * (e * f);
+//         tmp = tmp + tmp1;
+//         let g = Simdf32x4::new(self.x[1], self.z[1], self.x[1], self.z[1]);
+//         let h = Simdf32x4::new(self.z[2], self.x[2], self.w[2], self.y[2]);
+//         let i = Simdf32x4::new(self.y[3], self.y[3], self.z[3], self.z[3]);
+//         let tmp1 = g * (h * i);
+//         tmp = tmp + tmp1;
+//         let tmp1 = g * (b * f);
+//         tmp = tmp - tmp1;
+//         let tmp1 = d * (h * c);
+//         tmp = tmp - tmp1;
+//         let tmp1 = a * (e * i);
+//         tmp = tmp - tmp1;
+//         let tmp: Vector4<f32> = (tmp * Simdf32x4::new(self.x[0], self.y[0], self.z[0], self.w[0])).into();
+//         tmp.sum()
+//     }
+// }
+
+// #[cfg(feature = "use_simd")]
+// impl_operator_simd!(@mm Mul<Vector4<f32>> for Matrix4<f32> {
+//     fn mul(matrix, vector) -> Vector4<f32> {
+//         matrix[0] * vector[0] + matrix[1] * vector[1] + matrix[2] * vector[2] + matrix[3] * vector[3]
+//     }
+// });
\ No newline at end of file
diff --git a/src/vector.rs b/src/vector.rs
index dbab97b..8e37c83 100644
--- a/src/vector.rs
+++ b/src/vector.rs
@@ -25,6 +25,13 @@ use angle::Rad;
 use approx::ApproxEq;
 use num::{BaseNum, BaseFloat, PartialOrd};
 
+#[cfg(feature = "use_simd")]
+use simd::f32x4 as Simdf32x4;
+#[cfg(feature = "use_simd")]
+use simd::i32x4 as Simdi32x4;
+#[cfg(feature = "use_simd")]
+use simd::u32x4 as Simdu32x4;
+
 /// A 1-dimensional vector.
 ///
 /// This type is marked as `#[repr(C)]`.
@@ -291,6 +298,217 @@ macro_rules! impl_vector {
     }
 }
 
+// Utility macro for generating associated functions for the vectors; unlike `impl_vector!`, most operator impls are marked `default` (specialization)
+#[cfg(feature = "use_simd")]
+macro_rules! impl_vector_default {
+    ($VectorN:ident { $($field:ident),+ }, $n:expr, $constructor:ident) => {
+        impl<S> $VectorN<S> {
+            /// Construct a new vector, using the provided values.
+            #[inline]
+            pub fn new($($field: S),+) -> $VectorN<S> {
+                $VectorN { $($field: $field),+ }
+            }
+        }
+
+        /// The short constructor.
+        #[inline]
+        pub fn $constructor<S>($($field: S),+) -> $VectorN<S> {
+            $VectorN::new($($field),+)
+        }
+
+        impl<S: NumCast + Copy> $VectorN<S> {
+            /// Component-wise casting to another type
+            #[inline]
+            pub fn cast<T: NumCast>(&self) -> $VectorN<T> {
+                $VectorN { $($field: NumCast::from(self.$field).unwrap()),+ }
+            }
+        }
+
+        impl<S: BaseFloat> MetricSpace for $VectorN<S> {
+            type Metric = S;
+
+            #[inline]
+            fn distance2(self, other: Self) -> S {
+                (other - self).magnitude2()
+            }
+        }
+
+        impl<S: Copy> Array for $VectorN<S> {
+            type Element = S;
+
+            #[inline]
+            fn from_value(scalar: S) -> $VectorN<S> {
+                $VectorN { $($field: scalar),+ }
+            }
+
+            #[inline]
+            fn sum(self) -> S where S: Add<Output = S> {
+                fold_array!(add, { $(self.$field),+ })
+            }
+
+            #[inline]
+            fn product(self) -> S where S: Mul<Output = S> {
+                fold_array!(mul, { $(self.$field),+ })
+            }
+
+            #[inline]
+            fn min(self) -> S where S: PartialOrd {
+                fold_array!(partial_min, { $(self.$field),+ })
+            }
+
+            #[inline]
+            fn max(self) -> S where S: PartialOrd {
+                fold_array!(partial_max, { $(self.$field),+ })
+            }
+        }
+
+        impl<S: BaseNum> Zero for $VectorN<S> {
+            #[inline]
+            fn zero() -> $VectorN<S> {
+                $VectorN::from_value(S::zero())
+            }
+
+            #[inline]
+            fn is_zero(&self) -> bool {
+                *self == $VectorN::zero()
+            }
+        }
+
+        impl<S: BaseNum> VectorSpace for $VectorN<S> {
+            type Scalar = S;
+        }
+
+        impl<S: Neg<Output = S>> Neg for $VectorN<S> {
+            type Output = $VectorN<S>;
+
+            #[inline]
+            default fn neg(self) -> $VectorN<S> { $VectorN::new($(-self.$field),+) }
+        }
+
+        impl<S: BaseFloat> ApproxEq for $VectorN<S> {
+            type Epsilon = S::Epsilon;
+
+            #[inline]
+            fn default_epsilon() -> S::Epsilon {
+                S::default_epsilon()
+            }
+
+            #[inline]
+            fn default_max_relative() -> S::Epsilon {
+                S::default_max_relative()
+            }
+
+            #[inline]
+            fn default_max_ulps() -> u32 {
+                S::default_max_ulps()
+            }
+
+            #[inline]
+            fn relative_eq(&self, other: &Self, epsilon: S::Epsilon, max_relative: S::Epsilon) -> bool {
+                $(S::relative_eq(&self.$field, &other.$field, epsilon, max_relative))&&+
+            }
+
+            #[inline]
+            fn ulps_eq(&self, other: &Self, epsilon: S::Epsilon, max_ulps: u32) -> bool {
+                $(S::ulps_eq(&self.$field, &other.$field, epsilon, max_ulps))&&+
+            }
+        }
+
+        impl<S: BaseFloat + Rand> Rand for $VectorN<S> {
+            #[inline]
+            fn rand<R: Rng>(rng: &mut R) -> $VectorN<S> {
+                $VectorN { $($field: rng.gen()),+ }
+            }
+        }
+
+        impl_operator_default!(<S: BaseNum> Add<$VectorN<S> > for $VectorN<S> {
+            fn add(lhs, rhs) -> $VectorN<S> { $VectorN::new($(lhs.$field + rhs.$field),+) }
+        });
+
+        impl_assignment_operator_default!(<S: BaseNum> AddAssign<$VectorN<S> > for $VectorN<S> {
+            fn add_assign(&mut self, other) { $(self.$field += other.$field);+ }
+        });
+
+        impl_operator_default!(<S: BaseNum> Sub<$VectorN<S> > for $VectorN<S> {
+            fn sub(lhs, rhs) -> $VectorN<S> { $VectorN::new($(lhs.$field - rhs.$field),+) }
+        });
+
+        impl_assignment_operator_default!(<S: BaseNum> SubAssign<$VectorN<S> > for $VectorN<S> {
+            fn sub_assign(&mut self, other) { $(self.$field -= other.$field);+ }
+        });
+
+        impl_operator_default!(<S: BaseNum> Mul<S> for $VectorN<S> {
+            fn mul(vector, scalar) -> $VectorN<S> { $VectorN::new($(vector.$field * scalar),+) }
+        });
+
+        impl_assignment_operator_default!(<S: BaseNum> MulAssign<S> for $VectorN<S> {
+            fn mul_assign(&mut self, scalar) { $(self.$field *= scalar);+ }
+        });
+
+        impl_operator_default!(<S: BaseNum> Div<S> for $VectorN<S> {
+            fn div(vector, scalar) -> $VectorN<S> { $VectorN::new($(vector.$field / scalar),+) }
+        });
+
+        impl_assignment_operator_default!(<S: BaseNum> DivAssign<S> for $VectorN<S> {
+            fn div_assign(&mut self, scalar) { $(self.$field /= scalar);+ }
+        });
+
+        impl_operator!(<S: BaseNum> Rem<S> for $VectorN<S> {
+            fn rem(vector, scalar) -> $VectorN<S> { $VectorN::new($(vector.$field % scalar),+) }
+        });
+        impl_assignment_operator!(<S: BaseNum> RemAssign<S> for $VectorN<S> {
+            fn rem_assign(&mut self, scalar) { $(self.$field %= scalar);+ }
+        });
+
+        impl<S: BaseNum> ElementWise for $VectorN<S> {
+            #[inline] default fn add_element_wise(self, rhs: $VectorN<S>) -> $VectorN<S> { $VectorN::new($(self.$field + rhs.$field),+) }
+            #[inline] default fn sub_element_wise(self, rhs: $VectorN<S>) -> $VectorN<S> { $VectorN::new($(self.$field - rhs.$field),+) }
+            #[inline] default fn mul_element_wise(self, rhs: $VectorN<S>) -> $VectorN<S> { $VectorN::new($(self.$field * rhs.$field),+) }
+            #[inline] default fn div_element_wise(self, rhs: $VectorN<S>) -> $VectorN<S> { $VectorN::new($(self.$field / rhs.$field),+) }
+            #[inline] fn rem_element_wise(self, rhs: $VectorN<S>) -> $VectorN<S> { $VectorN::new($(self.$field % rhs.$field),+) }
+
+            #[inline] default fn add_assign_element_wise(&mut self, rhs: $VectorN<S>) { $(self.$field += rhs.$field);+ }
+            #[inline] default fn sub_assign_element_wise(&mut self, rhs: $VectorN<S>) { $(self.$field -= rhs.$field);+ }
+            #[inline] default fn mul_assign_element_wise(&mut self, rhs: $VectorN<S>) { $(self.$field *= rhs.$field);+ }
+            #[inline] default fn div_assign_element_wise(&mut self, rhs: $VectorN<S>) { $(self.$field /= rhs.$field);+ }
+            #[inline] fn rem_assign_element_wise(&mut self, rhs: $VectorN<S>) { $(self.$field %= rhs.$field);+ }
+        }
+
+        impl<S: BaseNum> ElementWise<S> for $VectorN<S> {
+            #[inline] default fn add_element_wise(self, rhs: S) -> $VectorN<S> { $VectorN::new($(self.$field + rhs),+) }
+            #[inline] default fn sub_element_wise(self, rhs: S) -> $VectorN<S> { $VectorN::new($(self.$field - rhs),+) }
+            #[inline] default fn mul_element_wise(self, rhs: S) -> $VectorN<S> { $VectorN::new($(self.$field * rhs),+) }
+            #[inline] default fn div_element_wise(self, rhs: S) -> $VectorN<S> { $VectorN::new($(self.$field / rhs),+) }
+            #[inline] fn rem_element_wise(self, rhs: S) -> $VectorN<S> { $VectorN::new($(self.$field % rhs),+) }
+
+            #[inline] default fn add_assign_element_wise(&mut self, rhs: S) { $(self.$field += rhs);+ }
+            #[inline] default fn sub_assign_element_wise(&mut self, rhs: S) { $(self.$field -= rhs);+ }
+            #[inline] default fn mul_assign_element_wise(&mut self, rhs: S) { $(self.$field *= rhs);+ }
+            #[inline] default fn div_assign_element_wise(&mut self, rhs: S) { $(self.$field /= rhs);+ }
+            #[inline] fn rem_assign_element_wise(&mut self, rhs: S) { $(self.$field %= rhs);+ }
+        }
+
+        impl_scalar_ops!($VectorN<usize> { $($field),+ });
+        impl_scalar_ops!($VectorN<u8> { $($field),+ });
+        impl_scalar_ops!($VectorN<u16> { $($field),+ });
+        impl_scalar_ops_default!($VectorN<u32> { $($field),+ });
+        impl_scalar_ops!($VectorN<u64> { $($field),+ });
+        impl_scalar_ops!($VectorN<isize> { $($field),+ });
+        impl_scalar_ops!($VectorN<i8> { $($field),+ });
+        impl_scalar_ops!($VectorN<i16> { $($field),+ });
+        impl_scalar_ops_default!($VectorN<i32> { $($field),+ });
+        impl_scalar_ops!($VectorN<i64> { $($field),+ });
+        impl_scalar_ops_default!($VectorN<f32> { $($field),+ });
+        impl_scalar_ops!($VectorN<f64> { $($field),+ });
+
+        impl_index_operators!($VectorN<S>, $n, S, usize);
+        impl_index_operators!($VectorN<S>, $n, [S], Range<usize>);
+        impl_index_operators!($VectorN<S>, $n, [S], RangeTo<usize>);
+        impl_index_operators!($VectorN<S>, $n, [S], RangeFrom<usize>);
+        impl_index_operators!($VectorN<S>, $n, [S], RangeFull);
+    }
+}
+
 macro_rules! impl_scalar_ops {
     ($VectorN:ident<$S:ident> { $($field:ident),+ }) => {
         impl_operator!(Mul<$VectorN<$S>> for $S {
@@ -305,10 +523,28 @@ macro_rules! impl_scalar_ops {
     };
 }
 
+#[cfg(feature = "use_simd")]
+macro_rules! impl_scalar_ops_default {
+    ($VectorN:ident<$S:ident> { $($field:ident),+ }) => {
+        impl_operator_default!(Mul<$VectorN<$S>> for $S {
+            fn mul(scalar, vector) -> $VectorN<$S> { $VectorN::new($(scalar * vector.$field),+) }
+        });
+        impl_operator_default!(Div<$VectorN<$S>> for $S {
+            fn div(scalar, vector) -> $VectorN<$S> { $VectorN::new($(scalar / vector.$field),+) }
+        });
+        impl_operator_default!(Rem<$VectorN<$S>> for $S {
+            fn rem(scalar, vector) -> $VectorN<$S> { $VectorN::new($(scalar % vector.$field),+) }
+        });
+    };
+}
+
 impl_vector!(Vector1 { x }, 1, vec1);
 impl_vector!(Vector2 { x, y }, 2, vec2);
 impl_vector!(Vector3 { x, y, z }, 3, vec3);
+#[cfg(not(feature = "use_simd"))]
 impl_vector!(Vector4 { x, y, z, w }, 4, vec4);
+#[cfg(feature = "use_simd")]
+impl_vector_default!(Vector4 { x, y, z, w }, 4, vec4);
 
 impl_fixed_array_conversions!(Vector1<S> { x: 0 }, 1);
 impl_fixed_array_conversions!(Vector2<S> { x: 0, y: 1 }, 2);
@@ -350,7 +586,7 @@ impl<S: BaseNum> Vector2<S> {
     /// Create a `Vector3`, using the `x` and `y` values from this vector, and the
     /// provided `z`.
     #[inline]
-    pub fn extend(self, z: S)-> Vector3<S> {
+    pub fn extend(self, z: S) -> Vector3<S> {
         Vector3::new(self.x, self.y, z)
     }
 }
@@ -386,13 +622,13 @@ impl<S: BaseNum> Vector3<S> {
     /// Create a `Vector4`, using the `x`, `y` and `z` values from this vector, and the
     /// provided `w`.
     #[inline]
-    pub fn extend(self, w: S)-> Vector4<S> {
+    pub fn extend(self, w: S) -> Vector4<S> {
         Vector4::new(self.x, self.y, self.z, w)
     }
 
     /// Create a `Vector2`, dropping the `z` value.
     #[inline]
-    pub fn truncate(self)-> Vector2<S> {
+    pub fn truncate(self) -> Vector2<S> {
         Vector2::new(self.x, self.y)
     }
 }
@@ -424,27 +660,27 @@ impl<S: BaseNum> Vector4<S> {
 
     /// Create a `Vector3`, dropping the `w` value.
     #[inline]
-    pub fn truncate(self)-> Vector3<S> {
+    pub fn truncate(self) -> Vector3<S> {
         Vector3::new(self.x, self.y, self.z)
     }
 
     /// Create a `Vector3`, dropping the nth element
     #[inline]
-    pub fn truncate_n(&self, n: isize)-> Vector3<S> {
+    pub fn truncate_n(&self, n: isize) -> Vector3<S> {
         match n {
             0 => Vector3::new(self.y, self.z, self.w),
             1 => Vector3::new(self.x, self.z, self.w),
             2 => Vector3::new(self.x, self.y, self.w),
             3 => Vector3::new(self.x, self.y, self.z),
-            _ => panic!("{:?} is out of range", n)
+            _ => panic!("{:?} is out of range", n),
         }
     }
 }
 
 /// Dot product of two vectors.
 #[inline]
-pub fn dot<V: InnerSpace>(a: V, b: V) -> V::Scalar where
-    V::Scalar: BaseFloat,
+pub fn dot<V: InnerSpace>(a: V, b: V) -> V::Scalar
+    where V::Scalar: BaseFloat
 {
     V::dot(a, b)
 }
@@ -515,6 +751,371 @@ impl<S: fmt::Debug> fmt::Debug for Vector4<S> {
     }
 }
 
+#[cfg(feature = "use_simd")]
+impl From<Simdf32x4> for Vector4<f32> {
+    #[inline]
+    fn from(f: Simdf32x4) -> Self {
+        // Zero-fill instead of `mem::uninitialized()`: referencing uninitialized
+        // memory is undefined behaviour, and `store` overwrites it right away.
+        let mut ret = Vector4::new(0.0f32, 0.0f32, 0.0f32, 0.0f32);
+        {
+            let ret_mut: &mut [f32; 4] = ret.as_mut();
+            f.store(ret_mut.as_mut(), 0 as usize);
+        }
+        ret
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl Vector4<f32> {
+    /// Compute and return the square root of each element (SIMD `sqrt`).
+    #[inline]
+    pub fn sqrt_element_wide(self) -> Self {
+        let s: Simdf32x4 = self.into();
+        s.sqrt().into()
+    }
+
+    /// Compute and return an approximation (`approx_rsqrt`) of the reciprocal square root of each element.
+    #[inline]
+    pub fn rsqrt_element_wide(self) -> Self {
+        let s: Simdf32x4 = self.into();
+        s.approx_rsqrt().into()
+    }
+
+    /// Compute and return an approximation (`approx_reciprocal`) of the reciprocal of each element.
+    #[inline]
+    pub fn recip_element_wide(self) -> Self {
+        let s: Simdf32x4 = self.into();
+        s.approx_reciprocal().into()
+    }
+}
+
+
+
+#[cfg(feature = "use_simd")]
+impl Into<Simdf32x4> for Vector4<f32> {
+    /// Loads the vector's `[f32; 4]` view into a `Simdf32x4`, starting at index 0.
+    #[inline]
+    fn into(self) -> Simdf32x4 {
+        Simdf32x4::load(AsRef::<[f32; 4]>::as_ref(&self), 0)
+    }
+}
+
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{
+    [Simdf32x4]; Sub<Vector4<f32>> for Vector4<f32> {
+        fn sub(lhs, rhs) -> Vector4<f32> {
+            (lhs - rhs).into()
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{@rs
+    [Simdf32x4]; Mul<f32> for Vector4<f32> {
+        fn mul(lhs, rhs) -> Vector4<f32> {
+            (lhs * rhs).into()
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{@rs
+    [Simdf32x4]; Div<f32> for Vector4<f32> {
+        fn div(lhs, rhs) -> Vector4<f32> {
+            (lhs / rhs).into()
+        }
+    }
+}
+
+
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{
+    [Simdf32x4]; Neg for Vector4<f32> {
+        fn neg(lhs) -> Vector4<f32> {
+            (-lhs).into()
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl AddAssign for Vector4<f32> {
+    /// Adds `rhs` to `self` in place using a single `Simdf32x4` addition.
+    #[inline]
+    fn add_assign(&mut self, rhs: Self) {
+        let (lhs, rhs): (Simdf32x4, Simdf32x4) = ((*self).into(), rhs.into());
+        *self = (lhs + rhs).into();
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl SubAssign for Vector4<f32> {
+    /// Subtracts `rhs` from `self` in place using a single `Simdf32x4` subtraction.
+    #[inline]
+    fn sub_assign(&mut self, rhs: Self) {
+        let (lhs, rhs): (Simdf32x4, Simdf32x4) = ((*self).into(), rhs.into());
+        *self = (lhs - rhs).into();
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl MulAssign<f32> for Vector4<f32> {
+    /// Scales every component in place by `other`, broadcast with `splat`.
+    #[inline]
+    fn mul_assign(&mut self, other: f32) {
+        *self = (Into::<Simdf32x4>::into(*self) * Simdf32x4::splat(other)).into();
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl DivAssign<f32> for Vector4<f32> {
+    /// Divides every component in place by `other`, broadcast with `splat`.
+    #[inline]
+    fn div_assign(&mut self, other: f32) {
+        *self = (Into::<Simdf32x4>::into(*self) / Simdf32x4::splat(other)).into();
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl ElementWise for Vector4<f32> {
+    // Component-wise arithmetic between two `Vector4<f32>` values. Sums and
+    // differences reuse the vector operators; products and quotients are
+    // computed on `Simdf32x4` lanes directly.
+    #[inline] fn add_element_wise(self, rhs: Vector4<f32>) -> Vector4<f32> { self + rhs }
+    #[inline] fn sub_element_wise(self, rhs: Vector4<f32>) -> Vector4<f32> { self - rhs }
+    #[inline] fn mul_element_wise(self, rhs: Vector4<f32>) -> Vector4<f32> {
+        let (s, rhs): (Simdf32x4, Simdf32x4) = (self.into(), rhs.into());
+        (s * rhs).into()
+    }
+    #[inline] fn div_element_wise(self, rhs: Vector4<f32>) -> Vector4<f32> {
+        let (s, rhs): (Simdf32x4, Simdf32x4) = (self.into(), rhs.into());
+        (s / rhs).into()
+    }
+
+    #[inline] fn add_assign_element_wise(&mut self, rhs: Vector4<f32>) { (*self) += rhs; }
+    #[inline] fn sub_assign_element_wise(&mut self, rhs: Vector4<f32>) { (*self) -= rhs; }
+    #[inline] fn mul_assign_element_wise(&mut self, rhs: Vector4<f32>) {
+        let (s, rhs): (Simdf32x4, Simdf32x4) = ((*self).into(), rhs.into());
+        *self = (s * rhs).into();
+    }
+    #[inline] fn div_assign_element_wise(&mut self, rhs: Vector4<f32>) {
+        let (s, rhs): (Simdf32x4, Simdf32x4) = ((*self).into(), rhs.into());
+        // Divide, matching `div_element_wise`; this used to multiply by mistake.
+        *self = (s / rhs).into();
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl ElementWise<f32> for Vector4<f32> {
+    // Vector-with-scalar arithmetic: the scalar is broadcast to every lane
+    // with `Simdf32x4::splat` before the lane-wise operation.
+    #[inline] fn add_element_wise(self, rhs: f32) -> Vector4<f32> {
+        let lanes: Simdf32x4 = self.into();
+        (lanes + Simdf32x4::splat(rhs)).into()
+    }
+    #[inline] fn sub_element_wise(self, rhs: f32) -> Vector4<f32> {
+        let lanes: Simdf32x4 = self.into();
+        (lanes - Simdf32x4::splat(rhs)).into()
+    }
+    // Scaling goes through the `*` and `/` operators of `Vector4<f32>`.
+    #[inline] fn mul_element_wise(self, rhs: f32) -> Vector4<f32> { self * rhs }
+    #[inline] fn div_element_wise(self, rhs: f32) -> Vector4<f32> { self / rhs }
+
+    // In-place counterparts of the four methods above.
+    #[inline] fn add_assign_element_wise(&mut self, rhs: f32) {
+        let lanes: Simdf32x4 = (*self).into();
+        *self = (lanes + Simdf32x4::splat(rhs)).into();
+    }
+    #[inline] fn sub_assign_element_wise(&mut self, rhs: f32) {
+        let lanes: Simdf32x4 = (*self).into();
+        *self = (lanes - Simdf32x4::splat(rhs)).into();
+    }
+    #[inline] fn mul_assign_element_wise(&mut self, rhs: f32) { (*self) *= rhs; }
+    #[inline] fn div_assign_element_wise(&mut self, rhs: f32) { (*self) /= rhs; }
+}
+
+#[cfg(feature = "use_simd")]
+impl From<Simdi32x4> for Vector4<i32> {
+    #[inline]
+    fn from(f: Simdi32x4) -> Self {
+        // Zero-fill instead of `mem::uninitialized()`: referencing uninitialized
+        // memory is undefined behaviour, and `store` overwrites it right away.
+        let mut ret = Vector4::new(0i32, 0i32, 0i32, 0i32);
+        {
+            let ret_mut: &mut [i32; 4] = ret.as_mut();
+            f.store(ret_mut.as_mut(), 0 as usize);
+        }
+        ret
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl Into<Simdi32x4> for Vector4<i32> {
+    /// Loads the vector's `[i32; 4]` view into a `Simdi32x4`, starting at index 0.
+    #[inline]
+    fn into(self) -> Simdi32x4 {
+        Simdi32x4::load(AsRef::<[i32; 4]>::as_ref(&self), 0)
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{
+    [Simdi32x4]; Add<Vector4<i32>> for Vector4<i32> {
+        fn add(lhs, rhs) -> Vector4<i32> {
+            (lhs + rhs).into()
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{
+    [Simdi32x4]; Sub<Vector4<i32>> for Vector4<i32> {
+        fn sub(lhs, rhs) -> Vector4<i32> {
+            (lhs - rhs).into()
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{@rs
+    [Simdi32x4]; Mul<i32> for Vector4<i32> {
+        fn mul(lhs, rhs) -> Vector4<i32> {
+            (lhs * rhs).into()
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{
+    [Simdi32x4]; Neg for Vector4<i32> {
+        fn neg(lhs) -> Vector4<i32> {
+            (-lhs).into()
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl AddAssign for Vector4<i32> {
+    /// Adds `rhs` to `self` in place using a single `Simdi32x4` addition.
+    #[inline]
+    fn add_assign(&mut self, rhs: Self) {
+        let (lhs, rhs): (Simdi32x4, Simdi32x4) = ((*self).into(), rhs.into());
+        *self = (lhs + rhs).into();
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl SubAssign for Vector4<i32> {
+    /// Subtracts `rhs` from `self` in place using a single `Simdi32x4` subtraction.
+    #[inline]
+    fn sub_assign(&mut self, rhs: Self) {
+        let (lhs, rhs): (Simdi32x4, Simdi32x4) = ((*self).into(), rhs.into());
+        *self = (lhs - rhs).into();
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl MulAssign<i32> for Vector4<i32> {
+    /// Scales every component in place by `other`, broadcast with `splat`.
+    #[inline]
+    fn mul_assign(&mut self, other: i32) {
+        *self = (Into::<Simdi32x4>::into(*self) * Simdi32x4::splat(other)).into();
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl From<Simdu32x4> for Vector4<u32> {
+    #[inline]
+    fn from(f: Simdu32x4) -> Self {
+        // Zero-fill instead of `mem::uninitialized()`: referencing uninitialized
+        // memory is undefined behaviour, and `store` overwrites it right away.
+        let mut ret = Vector4::new(0u32, 0u32, 0u32, 0u32);
+        {
+            let ret_mut: &mut [u32; 4] = ret.as_mut();
+            f.store(ret_mut.as_mut(), 0 as usize);
+        }
+        ret
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl Into<Simdu32x4> for Vector4<u32> {
+    /// Loads the vector's `[u32; 4]` view into a `Simdu32x4`, starting at index 0.
+    #[inline]
+    fn into(self) -> Simdu32x4 {
+        Simdu32x4::load(AsRef::<[u32; 4]>::as_ref(&self), 0)
+    }
+}
+
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{
+    [Simdu32x4]; Add<Vector4<u32>> for Vector4<u32> {
+        fn add(lhs, rhs) -> Vector4<u32> {
+            (lhs + rhs).into()
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{
+    [Simdf32x4]; Add<Vector4<f32>> for Vector4<f32> {
+        fn add(lhs, rhs) -> Vector4<f32> {
+            (lhs + rhs).into()
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{
+    [Simdu32x4]; Sub<Vector4<u32>> for Vector4<u32> {
+        fn sub(lhs, rhs) -> Vector4<u32> {
+            (lhs - rhs).into()
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{@rs
+    [Simdu32x4]; Mul<u32> for Vector4<u32> {
+        fn mul(lhs, rhs) -> Vector4<u32> {
+            (lhs * rhs).into()
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl AddAssign for Vector4<u32> {
+    /// Adds `rhs` to `self` in place using a single `Simdu32x4` addition.
+    #[inline]
+    fn add_assign(&mut self, rhs: Self) {
+        let (lhs, rhs): (Simdu32x4, Simdu32x4) = ((*self).into(), rhs.into());
+        *self = (lhs + rhs).into();
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl SubAssign for Vector4<u32> {
+    /// Subtracts `rhs` from `self` in place using a single `Simdu32x4` subtraction.
+    #[inline]
+    fn sub_assign(&mut self, rhs: Self) {
+        let (lhs, rhs): (Simdu32x4, Simdu32x4) = ((*self).into(), rhs.into());
+        *self = (lhs - rhs).into();
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl MulAssign<u32> for Vector4<u32> {
+    /// Scales every component in place by `other`, broadcast with `splat`.
+    #[inline]
+    fn mul_assign(&mut self, other: u32) {
+        *self = (Into::<Simdu32x4>::into(*self) * Simdu32x4::splat(other)).into();
+    }
+}
+
+
 #[cfg(test)]
 mod tests {
     mod vector2 {
@@ -729,7 +1330,12 @@ mod tests {
     mod vector4 {
         use vector::*;
 
-        const VECTOR4: Vector4<i32> = Vector4 { x: 1, y: 2, z: 3, w: 4 };
+        const VECTOR4: Vector4<i32> = Vector4 {
+            x: 1,
+            y: 2,
+            z: 3,
+            w: 4,
+        };
 
         #[test]
         fn test_index() {
@@ -796,11 +1402,11 @@ mod tests {
         fn test_as_mut() {
             let mut v = VECTOR4;
             {
-                let v: &mut[i32; 4] = v.as_mut();
+                let v: &mut [i32; 4] = v.as_mut();
                 assert_eq!(v, &mut [1, 2, 3, 4]);
             }
             {
-                let v: &mut(i32, i32, i32, i32) = v.as_mut();
+                let v: &mut (i32, i32, i32, i32) = v.as_mut();
                 assert_eq!(v, &mut (1, 2, 3, 4));
             }
         }
diff --git a/tests/quaternion.rs b/tests/quaternion.rs
index f59911d..5b00619 100644
--- a/tests/quaternion.rs
+++ b/tests/quaternion.rs
@@ -194,13 +194,13 @@ mod rotate_from_euler {
         let vec = vec3(0.0, 1.0, 0.0);
 
         let rot = Quaternion::from(Euler::new(Deg(90.0), Deg(90.0), Deg(0.0)));
-        assert_ulps_eq!(vec3(0.0, 0.0, 1.0), rot * vec);
+        assert_ulps_eq!(vec3(0.0f32, 0.0f32, 1.0f32), rot * vec);
     }
 
     // tests that the Z rotation is done after the Y
     #[test]
     fn test_y_then_z() {
-        let vec = vec3(0.0, 0.0, 1.0);
+        let vec = vec3(0.0f32, 0.0f32, 1.0f32);
 
         let rot = Quaternion::from(Euler::new(Deg(0.0), Deg(90.0), Deg(90.0)));
         assert_ulps_eq!(vec3(1.0, 0.0, 0.0), rot * vec);
diff --git a/tests/vectorf32.rs b/tests/vectorf32.rs
new file mode 100644
index 0000000..fd930e8
--- /dev/null
+++ b/tests/vectorf32.rs
@@ -0,0 +1,267 @@
+// Copyright 2013-2014 The CGMath Developers. For a full listing of the authors,
+// refer to the Cargo.toml file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#[macro_use]
+extern crate approx;
+#[macro_use]
+extern crate cgmath;
+
+use cgmath::*;
+use std::f32;
+
+#[test]
+fn test_constructor() {
+    assert_eq!(vec2(1f32, 2f32), Vector2::new(1f32, 2f32));
+    assert_eq!(vec3(1f32, 2f32, 3f32), Vector3::new(1f32, 2f32, 3f32));
+    assert_eq!(vec4(1f32, 2f32, 3f32, 4f32), Vector4::new(1f32, 2f32, 3f32, 4f32));
+}
+
+#[test]
+fn test_from_value() {
+    assert_eq!(Vector2::from_value(102f32), Vector2::new(102f32, 102f32));
+    assert_eq!(Vector3::from_value(22f32), Vector3::new(22f32, 22f32, 22f32));
+    assert_eq!(Vector4::from_value(76.5f32), Vector4::new(76.5f32, 76.5f32, 76.5f32, 76.5f32));
+}
+
+macro_rules! impl_test_add {
+    ($VectorN:ident { $($field:ident),+ }, $s:expr, $v:expr) => (
+        // vector + vector ops
+        assert_eq!($v + $v, $VectorN::new($($v.$field + $v.$field),+));
+        assert_eq!(&$v + &$v, $v + $v);
+        assert_eq!(&$v + $v, $v + $v);
+        assert_eq!($v + &$v, $v + $v);
+    )
+}
+
+macro_rules! impl_test_sub {
+    ($VectorN:ident { $($field:ident),+ }, $s:expr, $v:expr) => (
+        // vector - vector ops
+        assert_eq!($v - $v, $VectorN::new($($v.$field - $v.$field),+));
+        assert_eq!(&$v - &$v, $v - $v);
+        assert_eq!(&$v - $v, $v - $v);
+        assert_eq!($v - &$v, $v - $v);
+    )
+}
+
+macro_rules! impl_test_mul {
+    ($VectorN:ident { $($field:ident),+ }, $s:expr, $v:expr) => (
+        // vector * scalar ops
+        assert_eq!($v * $s, $VectorN::new($($v.$field * $s),+));
+        assert_eq!($s * $v, $VectorN::new($($s * $v.$field),+));
+        assert_eq!(&$v * $s, $v * $s);
+        assert_eq!($s * &$v, $s * $v);
+        // commutativity
+        assert_eq!($v * $s, $s * $v);
+    )
+}
+
+macro_rules! impl_test_div {
+    ($VectorN:ident { $($field:ident),+ }, $s:expr, $v:expr) => (
+        // vector / scalar ops
+        assert_eq!($v / $s, $VectorN::new($($v.$field / $s),+));
+        assert_eq!($s / $v, $VectorN::new($($s / $v.$field),+));
+        assert_eq!(&$v / $s, $v / $s);
+        assert_eq!($s / &$v, $s / $v);
+    )
+}
+
+macro_rules! impl_test_rem {
+    ($VectorN:ident { $($field:ident),+ }, $s:expr, $v:expr) => (
+        // vector % scalar ops
+        assert_eq!($v % $s, $VectorN::new($($v.$field % $s),+));
+        assert_eq!($s % $v, $VectorN::new($($s % $v.$field),+));
+        assert_eq!(&$v % $s, $v % $s);
+        assert_eq!($s % &$v, $s % $v);
+    )
+}
+
+#[test]
+fn test_add() {
+    impl_test_add!(Vector4 { x, y, z, w }, 2.0f32, vec4(2.0f32, 4.0f32, 6.0f32, 8.0f32));
+}
+
+#[test]
+fn test_sub() {
+    impl_test_sub!(Vector4 { x, y, z, w }, 2.0f32, vec4(2.0f32, 4.0f32, 6.0f32, 8.0f32));
+    impl_test_sub!(Vector3 { x, y, z }, 2.0f32, vec3(2.0f32, 4.0f32, 6.0f32));
+    impl_test_sub!(Vector2 { x, y }, 2.0f32, vec2(2.0f32, 4.0f32));
+}
+
+#[test]
+fn test_mul() {
+    impl_test_mul!(Vector4 { x, y, z, w }, 2.0f32, vec4(2.0f32, 4.0f32, 6.0f32, 8.0f32));
+    impl_test_mul!(Vector3 { x, y, z }, 2.0f32, vec3(2.0f32, 4.0f32, 6.0f32));
+    impl_test_mul!(Vector2 { x, y }, 2.0f32, vec2(2.0f32, 4.0f32));
+}
+
+#[test]
+fn test_div() {
+    impl_test_div!(Vector4 { x, y, z, w }, 2.0f32, vec4(2.0f32, 4.0f32, 6.0f32, 8.0f32));
+    impl_test_div!(Vector3 { x, y, z }, 2.0f32, vec3(2.0f32, 4.0f32, 6.0f32));
+    impl_test_div!(Vector2 { x, y }, 2.0f32, vec2(2.0f32, 4.0f32));
+}
+
+#[test]
+fn test_rem() {
+    impl_test_rem!(Vector4 { x, y, z, w }, 2.0f32, vec4(2.0f32, 4.0f32, 6.0f32, 8.0f32));
+    impl_test_rem!(Vector3 { x, y, z }, 2.0f32, vec3(2.0f32, 4.0f32, 6.0f32));
+    impl_test_rem!(Vector2 { x, y }, 2.0f32, vec2(2.0f32, 4.0f32));
+}
+
+#[test]
+fn test_dot() {
+    assert_eq!(Vector2::new(1.0f32, 2.0f32).dot(Vector2::new(3.0f32, 4.0f32)), 11.0f32);
+    assert_eq!(Vector3::new(1.0f32, 2.0f32, 3.0f32).dot(Vector3::new(4.0f32, 5.0f32, 6.0f32)), 32.0f32);
+    assert_eq!(Vector4::new(1.0f32, 2.0f32, 3.0f32, 4.0f32).dot(Vector4::new(5.0f32, 6.0f32, 7.0f32, 8.0f32)), 70.0f32);
+}
+
+#[test]
+fn test_sum() {
+    assert_eq!(Vector2::new(1f32, 2f32).sum(), 3f32);
+    assert_eq!(Vector3::new(1f32, 2f32, 3f32).sum(), 6f32);
+    assert_eq!(Vector4::new(1f32, 2f32, 3f32, 4f32).sum(), 10f32);
+
+    assert_eq!(Vector2::new(3.0f32, 4.0f32).sum(), 7.0f32);
+    assert_eq!(Vector3::new(4.0f32, 5.0f32, 6.0f32).sum(), 15.0f32);
+    assert_eq!(Vector4::new(5.0f32, 6.0f32, 7.0f32, 8.0f32).sum(), 26.0f32);
+}
+
+#[test]
+fn test_product() {
+    assert_eq!(Vector2::new(1f32, 2f32).product(), 2f32);
+    assert_eq!(Vector3::new(1f32, 2f32, 3f32).product(), 6f32);
+    assert_eq!(Vector4::new(1f32, 2f32, 3f32, 4f32).product(), 24f32);
+
+    assert_eq!(Vector2::new(3.0f32, 4.0f32).product(), 12.0f32);
+    assert_eq!(Vector3::new(4.0f32, 5.0f32, 6.0f32).product(), 120.0f32);
+    assert_eq!(Vector4::new(5.0f32, 6.0f32, 7.0f32, 8.0f32).product(), 1680.0f32);
+}
+
+#[test]
+fn test_min() {
+    assert_eq!(Vector2::new(1f32, 2f32).min(), 1f32);
+    assert_eq!(Vector3::new(1f32, 2f32, 3f32).min(), 1f32);
+    assert_eq!(Vector4::new(1f32, 2f32, 3f32, 4f32).min(), 1f32);
+
+    assert_eq!(Vector2::new(3.0f32, 4.0f32).min(), 3.0f32);
+    assert_eq!(Vector3::new(4.0f32, 5.0f32, 6.0f32).min(), 4.0f32);
+    assert_eq!(Vector4::new(5.0f32, 6.0f32, 7.0f32, 8.0f32).min(), 5.0f32);
+}
+
+#[test]
+fn test_max() {
+    assert_eq!(Vector2::new(1f32, 2f32).max(), 2f32);
+    assert_eq!(Vector3::new(1f32, 2f32, 3f32).max(), 3f32);
+    assert_eq!(Vector4::new(1f32, 2f32, 3f32, 4f32).max(), 4f32);
+
+    assert_eq!(Vector2::new(3.0f32, 4.0f32).max(), 4.0f32);
+    assert_eq!(Vector3::new(4.0f32, 5.0f32, 6.0f32).max(), 6.0f32);
+    assert_eq!(Vector4::new(5.0f32, 6.0f32, 7.0f32, 8.0f32).max(), 8.0f32);
+}
+
+#[test]
+fn test_cross() {
+    let a = Vector3::new(1f32, 2f32, 3f32);
+    let b = Vector3::new(4f32, 5f32, 6f32);
+    let r = Vector3::new(-3f32, 6f32, -3f32);
+    assert_eq!(a.cross(b), r);
+}
+
+#[test]
+fn test_is_perpendicular() {
+    assert!(Vector2::new(1.0f32, 0.0f32).is_perpendicular(Vector2::new(0.0f32, 1.0f32)));
+    assert!(Vector3::new(0.0f32, 1.0f32, 0.0f32).is_perpendicular(Vector3::new(0.0f32, 0.0f32, 1.0f32)));
+    assert!(Vector4::new(1.0f32, 0.0f32, 0.0f32, 0.0f32).is_perpendicular(Vector4::new(0.0f32, 0.0f32, 0.0f32, 1.0f32)));
+}
+
+#[cfg(test)]
+mod test_magnitude {
+    use cgmath::*;
+
+    #[test]
+    fn test_vector2(){
+        let (a, a_res) = (Vector2::new(3.0f32, 4.0f32), 5.0f32); // (3, 4, 5) Pythagorean triple
+        let (b, b_res) = (Vector2::new(5.0f32, 12.0f32), 13.0f32); // (5, 12, 13) Pythagorean triple
+
+        assert_eq!(a.magnitude2(), a_res * a_res);
+        assert_eq!(b.magnitude2(), b_res * b_res);
+
+        assert_eq!(a.magnitude(), a_res);
+        assert_eq!(b.magnitude(), b_res);
+    }
+
+    #[test]
+    fn test_vector3(){
+        let (a, a_res) = (Vector3::new(2.0f32, 3.0f32, 6.0f32), 7.0f32); // (2, 3, 6, 7) Pythagorean quadruple
+        let (b, b_res) = (Vector3::new(1.0f32, 4.0f32, 8.0f32), 9.0f32); // (1, 4, 8, 9) Pythagorean quadruple
+
+        assert_eq!(a.magnitude2(), a_res * a_res);
+        assert_eq!(b.magnitude2(), b_res * b_res);
+
+        assert_eq!(a.magnitude(), a_res);
+        assert_eq!(b.magnitude(), b_res);
+    }
+
+    #[test]
+    fn test_vector4(){
+        let (a, a_res) = (Vector4::new(1.0f32, 2.0f32, 4.0f32, 10.0f32), 11.0f32); // (1, 2, 4, 10, 11) Pythagorean quintuple
+        let (b, b_res) = (Vector4::new(1.0f32, 2.0f32, 8.0f32, 10.0f32), 13.0f32); // (1, 2, 8, 10, 13) Pythagorean quintuple
+
+        assert_eq!(a.magnitude2(), a_res * a_res);
+        assert_eq!(b.magnitude2(), b_res * b_res);
+
+        assert_eq!(a.magnitude(), a_res);
+        assert_eq!(b.magnitude(), b_res);
+
+        #[cfg(feature = "use_simd")]
+        {
+            let a = Vector4::new(1f32, 4f32, 9f32, 16f32);
+            assert_ulps_eq!(a.sqrt_element_wide(), Vector4::new(1f32, 2f32, 3f32, 4f32));
+            assert_relative_eq!(a.sqrt_element_wide().recip_element_wide(), Vector4::new(1f32, 1f32/2f32, 1f32/3f32, 1f32/4f32), max_relative = 0.005f32);
+            assert_relative_eq!(a.rsqrt_element_wide(), Vector4::new(1f32, 1f32/2f32, 1f32/3f32, 1f32/4f32), max_relative= 0.005f32);
+        }
+        
+    }
+}
+
+#[test]
+fn test_angle() {
+    assert_ulps_eq!(Vector2::new(1.0f32, 0.0f32).angle(Vector2::new(0.0f32, 1.0f32)), &Rad(f32::consts::FRAC_PI_2));
+    assert_ulps_eq!(Vector2::new(10.0f32, 0.0f32).angle(Vector2::new(0.0f32, 5.0f32)), &Rad(f32::consts::FRAC_PI_2));
+    assert_ulps_eq!(Vector2::new(-1.0f32, 0.0f32).angle(Vector2::new(0.0f32, 1.0f32)), &-Rad(f32::consts::FRAC_PI_2));
+
+    assert_ulps_eq!(Vector3::new(1.0f32, 0.0f32, 1.0f32).angle(Vector3::new(1.0f32, 1.0f32, 0.0f32)), &Rad(f32::consts::FRAC_PI_3));
+    assert_ulps_eq!(Vector3::new(10.0f32, 0.0f32, 10.0f32).angle(Vector3::new(5.0f32, 5.0f32, 0.0f32)), &Rad(f32::consts::FRAC_PI_3));
+    assert_ulps_eq!(Vector3::new(-1.0f32, 0.0f32, -1.0f32).angle(Vector3::new(1.0f32, -1.0f32, 0.0f32)), &Rad(2.0f32 * f32::consts::FRAC_PI_3));
+
+    assert_ulps_eq!(Vector4::new(1.0f32, 0.0f32, 1.0f32, 0.0f32).angle(Vector4::new(0.0f32, 1.0f32, 0.0f32, 1.0f32)), &Rad(f32::consts::FRAC_PI_2));
+    assert_ulps_eq!(Vector4::new(10.0f32, 0.0f32, 10.0f32, 0.0f32).angle(Vector4::new(0.0f32, 5.0f32, 0.0f32, 5.0f32)), &Rad(f32::consts::FRAC_PI_2));
+    assert_ulps_eq!(Vector4::new(-1.0f32, 0.0f32, -1.0f32, 0.0f32).angle(Vector4::new(0.0f32, 1.0f32, 0.0f32, 1.0f32)), &Rad(f32::consts::FRAC_PI_2));
+}
+
+#[test]
+fn test_normalize() {
+    // TODO: test normalize_to, normalize_self, and normalize_self_to
+    assert_ulps_eq!(Vector2::new(3.0f32, 4.0f32).normalize(), &Vector2::new(3.0f32/5.0f32, 4.0f32/5.0f32));
+    assert_ulps_eq!(Vector3::new(2.0f32, 3.0f32, 6.0f32).normalize(), &Vector3::new(2.0f32/7.0f32, 3.0f32/7.0f32, 6.0f32/7.0f32));
+    assert_ulps_eq!(Vector4::new(1.0f32, 2.0f32, 4.0f32, 10.0f32).normalize(), &Vector4::new(1.0f32/11.0f32, 2.0f32/11.0f32, 4.0f32/11.0f32, 10.0f32/11.0f32));
+}
+
+#[test]
+fn test_cast() {
+    assert_ulps_eq!(Vector2::new(0.9f32, 1.5).cast(), Vector2::new(0.9f32, 1.5));
+    assert_ulps_eq!(Vector3::new(1.0f32, 2.4, -3.13).cast(), Vector3::new(1.0f32, 2.4, -3.13));
+    assert_ulps_eq!(Vector4::new(13.5f32, -4.6, -8.3, 2.41).cast(), Vector4::new(13.5f32, -4.6, -8.3, 2.41));
+}