diff --git a/.travis.yml b/.travis.yml index 38d6b8d..7539fd4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,4 +6,6 @@ rust: - stable script: - - cargo build && cargo test && cargo bench + - cargo build && cargo test + - if [[ "$TRAVIS_RUST_VERSION" == "nightly" ]]; then cargo bench; fi + - if [[ "$TRAVIS_RUST_VERSION" == "nightly" ]]; then cargo build --features "use_simd" && cargo test --features "use_simd" && cargo bench --features "use_simd"; fi diff --git a/Cargo.toml b/Cargo.toml index 12bd240..f470093 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,6 +30,7 @@ name = "cgmath" unstable = [] default = ["rustc-serialize"] eders = ["serde", "serde_macros"] +use_simd = ["simd"] [dependencies] approx = "0.1" @@ -38,6 +39,7 @@ rand = "0.3" rustc-serialize = { version = "0.3", optional = true } serde = { version = "0.8", optional = true } serde_macros = { version = "0.8", optional = true } +simd = { version = "0.2", optional = true } [dev-dependencies] glium = "0.15" diff --git a/benches/mat.rs b/benches/mat.rs index f472be2..88efab9 100644 --- a/benches/mat.rs +++ b/benches/mat.rs @@ -59,3 +59,5 @@ bench_unop!(_bench_matrix4_invert, Matrix4, invert); bench_unop!(_bench_matrix2_transpose, Matrix2, transpose); bench_unop!(_bench_matrix3_transpose, Matrix3, transpose); bench_unop!(_bench_matrix4_transpose, Matrix4, transpose); + +bench_unop!(_bench_matrix4_determinant, Matrix4, determinant); diff --git a/src/lib.rs b/src/lib.rs index 822857b..8941603 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -49,9 +49,9 @@ //! ```rust //! use cgmath::prelude::*; //! ``` - #![cfg_attr(feature = "eders", feature(plugin, custom_derive))] #![cfg_attr(feature = "eders", plugin(serde_macros))] +#![cfg_attr(feature = "use_simd", feature(specialization))] #[macro_use] extern crate approx; @@ -64,6 +64,9 @@ extern crate rustc_serialize; #[cfg(feature = "eders")] extern crate serde; +#[cfg(feature = "use_simd")] +extern crate simd; + // Re-exports pub use approx::*; diff --git a/src/macros.rs b/src/macros.rs index d9e54de..e1fa8c6 100644 --- a/src/macros.rs +++ b/src/macros.rs @@ -254,3 +254,205 @@ macro_rules! impl_index_operators { } } } + +#[cfg(feature = "use_simd")] +macro_rules! 
impl_operator_default { + // When it is an unary operator + (<$S:ident: $Constraint:ident> $Op:ident for $Lhs:ty { + fn $op:ident($x:ident) -> $Output:ty { $body:expr } + }) => { + impl<$S: $Constraint> $Op for $Lhs { + type Output = $Output; + #[inline] + default fn $op(self) -> $Output { + let $x = self; $body + } + } + + impl<'a, $S: $Constraint> $Op for &'a $Lhs { + type Output = $Output; + #[inline] + default fn $op(self) -> $Output { + let $x = self; $body + } + } + }; + // When the right operand is a scalar + (<$S:ident: $Constraint:ident> $Op:ident<$Rhs:ident> for $Lhs:ty { + fn $op:ident($lhs:ident, $rhs:ident) -> $Output:ty { $body:expr } + }) => { + impl<$S: $Constraint> $Op<$Rhs> for $Lhs { + type Output = $Output; + #[inline] + default fn $op(self, other: $Rhs) -> $Output { + let ($lhs, $rhs) = (self, other); $body + } + } + + impl<'a, $S: $Constraint> $Op<$Rhs> for &'a $Lhs { + type Output = $Output; + #[inline] + default fn $op(self, other: $Rhs) -> $Output { + let ($lhs, $rhs) = (self, other); $body + } + } + }; + // When the right operand is a compound type + (<$S:ident: $Constraint:ident> $Op:ident<$Rhs:ty> for $Lhs:ty { + fn $op:ident($lhs:ident, $rhs:ident) -> $Output:ty { $body:expr } + }) => { + impl<$S: $Constraint> $Op<$Rhs> for $Lhs { + type Output = $Output; + #[inline] + default fn $op(self, other: $Rhs) -> $Output { + let ($lhs, $rhs) = (self, other); $body + } + } + + impl<'a, $S: $Constraint> $Op<&'a $Rhs> for $Lhs { + type Output = $Output; + #[inline] + default fn $op(self, other: &'a $Rhs) -> $Output { + let ($lhs, $rhs) = (self, other); $body + } + } + + impl<'a, $S: $Constraint> $Op<$Rhs> for &'a $Lhs { + type Output = $Output; + #[inline] + default fn $op(self, other: $Rhs) -> $Output { + let ($lhs, $rhs) = (self, other); $body + } + } + + impl<'a, 'b, $S: $Constraint> $Op<&'a $Rhs> for &'b $Lhs { + type Output = $Output; + #[inline] + default fn $op(self, other: &'a $Rhs) -> $Output { + let ($lhs, $rhs) = (self, other); $body + } + } + }; + // When the left operand is a scalar + ($Op:ident<$Rhs:ident<$S:ident>> for $Lhs:ty { + fn $op:ident($lhs:ident, $rhs:ident) -> $Output:ty { $body:expr } + }) => { + impl $Op<$Rhs<$S>> for $Lhs { + type Output = $Output; + #[inline] + default fn $op(self, other: $Rhs<$S>) -> $Output { + let ($lhs, $rhs) = (self, other); $body + } + } + + impl<'a> $Op<&'a $Rhs<$S>> for $Lhs { + type Output = $Output; + #[inline] + default fn $op(self, other: &'a $Rhs<$S>) -> $Output { + let ($lhs, $rhs) = (self, other); $body + } + } + }; +} + +#[cfg(feature = "use_simd")] +macro_rules! impl_assignment_operator_default { + (<$S:ident: $Constraint:ident> $Op:ident<$Rhs:ty> for $Lhs:ty { + fn $op:ident(&mut $lhs:ident, $rhs:ident) $body:block + }) => { + impl<$S: $Constraint + $Op<$S>> $Op<$Rhs> for $Lhs { + #[inline] + default fn $op(&mut $lhs, $rhs: $Rhs) $body + } + }; +} + +/// Generates a binary operator implementation for the permutations of by-ref and by-val, for simd +#[cfg(feature = "use_simd")] +macro_rules! 
impl_operator_simd { + // When it is an unary operator + ([$Simd:ident]; $Op:ident for $Lhs:ty { + fn $op:ident($x:ident) -> $Output:ty { $body:expr } + }) => { + + impl $Op for $Lhs { + #[inline] + fn $op(self) -> $Output { + let $x: $Simd = self.into(); $body + } + } + }; + // When the right operand is a scalar + (@rs [$Simd:ident]; $Op:ident<$Rhs:ty> for $Lhs:ty { + fn $op:ident($lhs:ident, $rhs:ident) -> $Output:ty { $body:expr } + }) => { + impl $Op<$Rhs> for $Lhs { + #[inline] + fn $op(self, other: $Rhs) -> $Output { + let ($lhs, $rhs): ($Simd, $Simd) = (self.into(), $Simd::splat(other)); $body + } + } + + + impl<'a> $Op<$Rhs> for &'a $Lhs { + #[inline] + fn $op(self, other: $Rhs) -> $Output { + let ($lhs, $rhs): ($Simd, $Simd) = ((*self).into(), $Simd::splat(other)); $body + } + } + }; + + // When the right operand is a compound type + ([$Simd:ident]; $Op:ident<$Rhs:ty> for $Lhs:ty { + fn $op:ident($lhs:ident, $rhs:ident) -> $Output:ty { $body:expr } + }) => { + + impl $Op<$Rhs> for $Lhs { + #[inline] + fn $op(self, other: $Rhs) -> $Output { + let ($lhs, $rhs): ($Simd, $Simd) = (self.into(), other.into()); $body + } + } + + + impl<'a> $Op<&'a $Rhs> for $Lhs { + #[inline] + fn $op(self, other: &'a $Rhs) -> $Output { + let ($lhs, $rhs): ($Simd, $Simd) = (self.into(), (*other).into()); $body + } + } + + impl<'a> $Op<$Rhs> for &'a $Lhs { + #[inline] + fn $op(self, other: $Rhs) -> $Output { + let ($lhs, $rhs): ($Simd, $Simd) = ((*self).into(), other.into()); $body + } + } + + impl<'a, 'b> $Op<&'a $Rhs> for &'b $Lhs { + #[inline] + fn $op(self, other: &'a $Rhs) -> $Output { + let ($lhs, $rhs): ($Simd, $Simd) = ((*self).into(), (*other).into()); $body + } + } + }; + + // When the left operand is a scalar + (@ls [$Simd:ident]; $Op:ident<$Rhs:ty> for $Lhs:ident { + fn $op:ident($lhs:ident, $rhs:ident) -> $Output:ty { $body:expr } + }) => { + impl $Op<$Rhs> for $Lhs { + #[inline] + fn $op(self, other: $Rhs) -> $Output { + let ($lhs, $rhs): ($Simd, $Simd) = ($Simd::splat(self), other.into()); $body + } + } + + impl<'a> $Op<&'a $Rhs> for $Lhs { + #[inline] + fn $op(self, other: &'a $Rhs) -> $Output { + let ($lhs, $rhs): ($Simd, $Simd) = ($Simd::splat(self), (*other).into()); $body + } + } + }; +} diff --git a/src/matrix.rs b/src/matrix.rs index 6a1e2fd..e900c66 100644 --- a/src/matrix.rs +++ b/src/matrix.rs @@ -615,6 +615,7 @@ impl Matrix for Matrix4 { } } + impl SquareMatrix for Matrix4 { type ColumnRow = Vector4; @@ -644,23 +645,10 @@ impl SquareMatrix for Matrix4 { } fn determinant(&self) -> S { - let m0 = Matrix3::new(self[1][1], self[2][1], self[3][1], - self[1][2], self[2][2], self[3][2], - self[1][3], self[2][3], self[3][3]); - let m1 = Matrix3::new(self[0][1], self[2][1], self[3][1], - self[0][2], self[2][2], self[3][2], - self[0][3], self[2][3], self[3][3]); - let m2 = Matrix3::new(self[0][1], self[1][1], self[3][1], - self[0][2], self[1][2], self[3][2], - self[0][3], self[1][3], self[3][3]); - let m3 = Matrix3::new(self[0][1], self[1][1], self[2][1], - self[0][2], self[1][2], self[2][2], - self[0][3], self[1][3], self[2][3]); - - self[0][0] * m0.determinant() - - self[1][0] * m1.determinant() + - self[2][0] * m2.determinant() - - self[3][0] * m3.determinant() + let tmp = unsafe { + det_sub_proc_unsafe(self, 1, 2, 3) + }; + tmp.dot(Vector4::new(self[0][0], self[1][0], self[2][0], self[3][0])) } #[inline] @@ -671,6 +659,12 @@ impl SquareMatrix for Matrix4 { self[3][3]) } + // The new implementation results in negative optimization when used + // without SIMD. 
so we only enable it behind the "use_simd" feature.
+    // A better option would be specialization, but specialization is currently
+    // too buggy and does not apply here: it produces puzzling error messages.
+    // Help wanted.
+    #[cfg(not(feature = "use_simd"))]
     fn invert(&self) -> Option<Matrix4<S>> {
         let det = self.determinant();
         if ulps_eq!(det, &S::zero()) { None } else {
@@ -694,6 +688,27 @@
                          cf(3, 0), cf(3, 1), cf(3, 2), cf(3, 3)))
         }
     }
+    #[cfg(feature = "use_simd")]
+    fn invert(&self) -> Option<Matrix4<S>> {
+        let tmp0 = unsafe {
+            det_sub_proc_unsafe(self, 1, 2, 3)
+        };
+        let det = tmp0.dot(Vector4::new(self[0][0], self[1][0], self[2][0], self[3][0]));
+        if ulps_eq!(det, &S::zero()) { None } else {
+            let inv_det = S::one() / det;
+            let tmp0 = tmp0 * inv_det;
+            let tmp1 = unsafe {
+                det_sub_proc_unsafe(self, 0, 3, 2) * inv_det
+            };
+            let tmp2 = unsafe {
+                det_sub_proc_unsafe(self, 0, 1, 3) * inv_det
+            };
+            let tmp3 = unsafe {
+                det_sub_proc_unsafe(self, 0, 2, 1) * inv_det
+            };
+            Some(Matrix4::from_cols(tmp0, tmp1, tmp2, tmp3))
+        }
+    }
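// For intuition, both `invert` paths above must agree observably: `None` for a
// singular matrix, the inverse otherwise. A minimal sketch against the crate's
// public `SquareMatrix` API (`from_value` builds a diagonal matrix); powers of
// two keep the floating-point comparisons exact:

extern crate cgmath;
use cgmath::{Matrix4, SquareMatrix};

fn main() {
    let m: Matrix4<f32> = Matrix4::from_value(2.0); // diag(2, 2, 2, 2)
    assert_eq!(m.determinant(), 16.0);
    assert_eq!(m.invert(), Some(Matrix4::from_value(0.5)));
    // A zero determinant yields None on both the scalar and the SIMD path.
    assert!(Matrix4::<f32>::from_value(0.0).invert().is_none());
}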
     fn is_diagonal(&self) -> bool {
         ulps_eq!(self[0][1], &S::zero()) &&
@@ -955,10 +970,6 @@
         fn sub_assign(&mut self, other: $MatrixN) { $(self.$field -= other.$field);+ }
     }

-        impl_operator!(<S: BaseFloat> Mul<$VectorN<S> > for $MatrixN<S> {
-            fn mul(matrix, vector) -> $VectorN<S> { $VectorN::new($(matrix.row($row_index).dot(vector.clone())),+) }
-        });
-
         impl_scalar_ops!($MatrixN<usize> { $($field),+ });
         impl_scalar_ops!($MatrixN<u8> { $($field),+ });
         impl_scalar_ops!($MatrixN<u16> { $($field),+ });
@@ -1001,6 +1012,25 @@
 impl_matrix!(Matrix2, Vector2 { x: 0, y: 1 });
 impl_matrix!(Matrix3, Vector3 { x: 0, y: 1, z: 2 });
 impl_matrix!(Matrix4, Vector4 { x: 0, y: 1, z: 2, w: 3 });

+macro_rules! impl_mv_operator {
+    ($MatrixN:ident, $VectorN:ident { $($field:ident : $row_index:expr),+ }) => {
+        impl_operator!(<S: BaseFloat> Mul<$VectorN<S> > for $MatrixN<S> {
+            fn mul(matrix, vector) -> $VectorN<S> { $VectorN::new($(matrix.row($row_index).dot(vector.clone())),+) }
+        });
+    }
+}
+
+impl_mv_operator!(Matrix2, Vector2 { x: 0, y: 1 });
+impl_mv_operator!(Matrix3, Vector3 { x: 0, y: 1, z: 2 });
+#[cfg(not(feature = "use_simd"))]
+impl_mv_operator!(Matrix4, Vector4 { x: 0, y: 1, z: 2, w: 3 });
+#[cfg(feature = "use_simd")]
+impl_operator!(<S: BaseFloat> Mul<Vector4<S> > for Matrix4<S> {
+    fn mul(matrix, vector) -> Vector4<S> {
+        matrix[0] * vector[0] + matrix[1] * vector[1] + matrix[2] * vector[2] + matrix[3] * vector[3]
+    }
+});
+
 impl_operator!(<S: BaseFloat> Mul<Matrix2<S> > for Matrix2<S> {
     fn mul(lhs, rhs) -> Matrix2<S> {
         Matrix2::new(lhs.row(0).dot(rhs[0]), lhs.row(1).dot(rhs[0]),
@@ -1020,21 +1050,22 @@ impl_operator!(<S: BaseFloat> Mul<Matrix3<S> > for Matrix3<S> {
 // causes the LLVM to miss identical loads and multiplies. This optimization
 // causes the code to be auto vectorized properly increasing the performance
 // around ~4 times.
-macro_rules! dot_matrix4 {
-    ($A:expr, $B:expr, $I:expr, $J:expr) => {
-        ($A[0][$I]) * ($B[$J][0]) +
-        ($A[1][$I]) * ($B[$J][1]) +
-        ($A[2][$I]) * ($B[$J][2]) +
-        ($A[3][$I]) * ($B[$J][3])
-    };
-}
+// Update: the implementation below should now be a bit more efficient.

 impl_operator!(<S: BaseFloat> Mul<Matrix4<S> > for Matrix4<S> {
     fn mul(lhs, rhs) -> Matrix4<S> {
-        Matrix4::new(dot_matrix4!(lhs, rhs, 0, 0), dot_matrix4!(lhs, rhs, 1, 0), dot_matrix4!(lhs, rhs, 2, 0), dot_matrix4!(lhs, rhs, 3, 0),
-                     dot_matrix4!(lhs, rhs, 0, 1), dot_matrix4!(lhs, rhs, 1, 1), dot_matrix4!(lhs, rhs, 2, 1), dot_matrix4!(lhs, rhs, 3, 1),
-                     dot_matrix4!(lhs, rhs, 0, 2), dot_matrix4!(lhs, rhs, 1, 2), dot_matrix4!(lhs, rhs, 2, 2), dot_matrix4!(lhs, rhs, 3, 2),
-                     dot_matrix4!(lhs, rhs, 0, 3), dot_matrix4!(lhs, rhs, 1, 3), dot_matrix4!(lhs, rhs, 2, 3), dot_matrix4!(lhs, rhs, 3, 3))
+        {
+            let a = lhs[0];
+            let b = lhs[1];
+            let c = lhs[2];
+            let d = lhs[3];
+            Matrix4::from_cols(
+                a*rhs[0][0] + b*rhs[0][1] + c*rhs[0][2] + d*rhs[0][3],
+                a*rhs[1][0] + b*rhs[1][1] + c*rhs[1][2] + d*rhs[1][3],
+                a*rhs[2][0] + b*rhs[2][1] + c*rhs[2][2] + d*rhs[2][3],
+                a*rhs[3][0] + b*rhs[3][1] + c*rhs[3][2] + d*rhs[3][3],
+            )
+        }
     }
 });

@@ -1318,3 +1349,27 @@ impl<S: BaseFloat + Rand> Rand for Matrix4<S> {
         Matrix4{ x: rng.gen(), y: rng.gen(), z: rng.gen(), w: rng.gen() }
     }
 }
+
+// Sub-procedure used by the SIMD determinant and inversion code: computes the
+// four signed 3x3 minors built from rows x, y and z, one for each column.
+#[inline]
+unsafe fn det_sub_proc_unsafe<S: BaseFloat>(m: &Matrix4<S>, x: usize, y: usize, z: usize) -> Vector4<S> {
+    let s: &[S; 16] = m.as_ref();
+    let a = Vector4::new(*s.get_unchecked(4 + x), *s.get_unchecked(12 + x), *s.get_unchecked(x), *s.get_unchecked(8 + x));
+    let b = Vector4::new(*s.get_unchecked(8 + y), *s.get_unchecked(8 + y), *s.get_unchecked(4 + y), *s.get_unchecked(4 + y));
+    let c = Vector4::new(*s.get_unchecked(12 + z), *s.get_unchecked(z), *s.get_unchecked(12 + z), *s.get_unchecked(z));
+
+    let d = Vector4::new(*s.get_unchecked(8 + x), *s.get_unchecked(8 + x), *s.get_unchecked(4 + x), *s.get_unchecked(4 + x));
+    let e = Vector4::new(*s.get_unchecked(12 + y), *s.get_unchecked(y), *s.get_unchecked(12 + y), *s.get_unchecked(y));
+    let f = Vector4::new(*s.get_unchecked(4 + z), *s.get_unchecked(12 + z), *s.get_unchecked(z), *s.get_unchecked(8 + z));
+
+    let g = Vector4::new(*s.get_unchecked(12 + x), *s.get_unchecked(x), *s.get_unchecked(12 + x), *s.get_unchecked(x));
+    let h = Vector4::new(*s.get_unchecked(4 + y), *s.get_unchecked(12 + y), *s.get_unchecked(y), *s.get_unchecked(8 + y));
+    let i = Vector4::new(*s.get_unchecked(8 + z), *s.get_unchecked(8 + z), *s.get_unchecked(4 + z), *s.get_unchecked(4 + z));
+
+    let mut tmp = a.mul_element_wise(b.mul_element_wise(c));
+    tmp += d.mul_element_wise(e.mul_element_wise(f));
+    tmp += g.mul_element_wise(h.mul_element_wise(i));
+    tmp -= a.mul_element_wise(e.mul_element_wise(i));
+    tmp -= d.mul_element_wise(h.mul_element_wise(c));
+    tmp -= g.mul_element_wise(b.mul_element_wise(f));
+    tmp
+}
diff --git a/src/quaternion.rs b/src/quaternion.rs
index c97aea6..384bf1e 100644
--- a/src/quaternion.rs
+++ b/src/quaternion.rs
@@ -30,6 +30,8 @@
 use point::Point3;
 use rotation::{Rotation, Rotation3, Basis3};
 use vector::Vector3;

+#[cfg(feature = "use_simd")]
+use simd::f32x4 as Simdf32x4;

 /// A [quaternion](https://en.wikipedia.org/wiki/Quaternion) in scalar/vector
 /// form.
@@ -46,6 +48,30 @@ pub struct Quaternion { pub v: Vector3, } +#[cfg(feature = "use_simd")] +impl From for Quaternion { + #[inline] + fn from(f: Simdf32x4) -> Self { + unsafe { + let mut ret: Self = mem::uninitialized(); + { + let ret_mut: &mut [f32; 4] = ret.as_mut(); + f.store(ret_mut.as_mut(), 0 as usize); + } + ret + } + } +} + +#[cfg(feature = "use_simd")] +impl Into for Quaternion { + #[inline] + fn into(self) -> Simdf32x4 { + let self_ref: &[f32; 4] = self.as_ref(); + Simdf32x4::load(self_ref.as_ref(), 0 as usize) + } +} + impl Quaternion { /// Construct a new quaternion from one scalar component and three /// imaginary components @@ -73,7 +99,7 @@ impl Quaternion { let mag_avg = (src.magnitude2() * dst.magnitude2()).sqrt(); let dot = src.dot(dst); if ulps_eq!(dot, &mag_avg) { - Quaternion::one() + Quaternion::::one() } else if ulps_eq!(dot, &-mag_avg) { let axis = fallback.unwrap_or_else(|| { let mut v = Vector3::unit_x().cross(src); @@ -151,7 +177,7 @@ impl Zero for Quaternion { #[inline] fn is_zero(&self) -> bool { - ulps_eq!(self, &Quaternion::zero()) + ulps_eq!(self, &Quaternion::::zero()) } } @@ -175,6 +201,7 @@ impl MetricSpace for Quaternion { } } +#[cfg(not(feature = "use_simd"))] impl InnerSpace for Quaternion { #[inline] fn dot(self, other: Quaternion) -> S { @@ -182,6 +209,25 @@ impl InnerSpace for Quaternion { } } +#[cfg(feature = "use_simd")] +impl InnerSpace for Quaternion { + #[inline] + default fn dot(self, other: Quaternion) -> S { + self.s * other.s + self.v.dot(other.v) + } +} + +#[cfg(feature = "use_simd")] +impl InnerSpace for Quaternion { + #[inline] + fn dot(self, other: Quaternion) -> f32 { + let lhs: Simdf32x4 = self.into(); + let rhs: Simdf32x4 = other.into(); + let r = lhs * rhs; + r.extract(0) + r.extract(1) + r.extract(2) + r.extract(3) + } +} + impl From> for Quaternion<::Unitless> where A: Angle + Into::Unitless>>, { @@ -203,35 +249,119 @@ impl From> for Quaternion<::Unitless> where } } +#[cfg(not(feature = "use_simd"))] impl_operator!( Neg for Quaternion { fn neg(quat) -> Quaternion { Quaternion::from_sv(-quat.s, -quat.v) } }); +#[cfg(feature = "use_simd")] +impl_operator_default!( Neg for Quaternion { + fn neg(quat) -> Quaternion { + Quaternion::from_sv(-quat.s, -quat.v) + } +}); + +#[cfg(feature = "use_simd")] +impl_operator_simd!{ + [Simdf32x4]; Neg for Quaternion { + fn neg(lhs) -> Quaternion { + (-lhs).into() + } + } +} + +#[cfg(not(feature = "use_simd"))] impl_operator!( Mul for Quaternion { fn mul(lhs, rhs) -> Quaternion { Quaternion::from_sv(lhs.s * rhs, lhs.v * rhs) } }); + +#[cfg(feature = "use_simd")] +impl_operator_default!( Mul for Quaternion { + fn mul(lhs, rhs) -> Quaternion { + Quaternion::from_sv(lhs.s * rhs, lhs.v * rhs) + } +}); + +#[cfg(feature = "use_simd")] +impl_operator_simd!{@rs + [Simdf32x4]; Mul for Quaternion { + fn mul(lhs, rhs) -> Quaternion { + (lhs * rhs).into() + } + } +} + +#[cfg(not(feature = "use_simd"))] impl_assignment_operator!( MulAssign for Quaternion { fn mul_assign(&mut self, scalar) { self.s *= scalar; self.v *= scalar; } }); +#[cfg(feature = "use_simd")] +impl_assignment_operator_default!( MulAssign for Quaternion { + fn mul_assign(&mut self, scalar) { self.s *= scalar; self.v *= scalar; } +}); + +#[cfg(feature = "use_simd")] +impl MulAssign for Quaternion { + fn mul_assign(&mut self, other: f32) { + let s: Simdf32x4 = (*self).into(); + let other = Simdf32x4::splat(other); + *self = (s * other).into(); + } +} + +#[cfg(not(feature = "use_simd"))] impl_operator!( Div for Quaternion { fn div(lhs, rhs) -> 
Quaternion { Quaternion::from_sv(lhs.s / rhs, lhs.v / rhs) } }); + +#[cfg(feature = "use_simd")] +impl_operator_default!( Div for Quaternion { + fn div(lhs, rhs) -> Quaternion { + Quaternion::from_sv(lhs.s / rhs, lhs.v / rhs) + } +}); + +#[cfg(feature = "use_simd")] +impl_operator_simd!{@rs + [Simdf32x4]; Div for Quaternion { + fn div(lhs, rhs) -> Quaternion { + (lhs / rhs).into() + } + } +} + +#[cfg(not(feature = "use_simd"))] impl_assignment_operator!( DivAssign for Quaternion { fn div_assign(&mut self, scalar) { self.s /= scalar; self.v /= scalar; } }); +#[cfg(feature = "use_simd")] +impl_assignment_operator_default!( DivAssign for Quaternion { + fn div_assign(&mut self, scalar) { self.s /= scalar; self.v /= scalar; } +}); + +#[cfg(feature = "use_simd")] +impl DivAssign for Quaternion { + fn div_assign(&mut self, other: f32) { + let s: Simdf32x4 = (*self).into(); + let other = Simdf32x4::splat(other); + *self = (s / other).into(); + } +} + impl_operator!( Rem for Quaternion { fn rem(lhs, rhs) -> Quaternion { Quaternion::from_sv(lhs.s % rhs, lhs.v % rhs) } }); + impl_assignment_operator!( RemAssign for Quaternion { fn rem_assign(&mut self, scalar) { self.s %= scalar; self.v %= scalar; } }); @@ -245,24 +375,93 @@ impl_operator!( Mul > for Quaternion { }} }); +#[cfg(not(feature = "use_simd"))] impl_operator!( Add > for Quaternion { fn add(lhs, rhs) -> Quaternion { Quaternion::from_sv(lhs.s + rhs.s, lhs.v + rhs.v) } }); + +#[cfg(feature = "use_simd")] +impl_operator_default!( Add > for Quaternion { + fn add(lhs, rhs) -> Quaternion { + Quaternion::from_sv(lhs.s + rhs.s, lhs.v + rhs.v) + } +}); + +#[cfg(feature = "use_simd")] +impl_operator_simd!{ + [Simdf32x4]; Add> for Quaternion { + fn add(lhs, rhs) -> Quaternion { + (lhs + rhs).into() + } + } +} + +#[cfg(not(feature = "use_simd"))] impl_assignment_operator!( AddAssign > for Quaternion { fn add_assign(&mut self, other) { self.s += other.s; self.v += other.v; } }); +#[cfg(feature = "use_simd")] +impl_assignment_operator_default!( AddAssign > for Quaternion { + fn add_assign(&mut self, other) { self.s += other.s; self.v += other.v; } +}); + +#[cfg(feature = "use_simd")] +impl AddAssign for Quaternion { + #[inline] + fn add_assign(&mut self, rhs: Self) { + let s: Simdf32x4 = (*self).into(); + let rhs: Simdf32x4 = rhs.into(); + *self = (s + rhs).into(); + } +} + +#[cfg(not(feature = "use_simd"))] impl_operator!( Sub > for Quaternion { fn sub(lhs, rhs) -> Quaternion { Quaternion::from_sv(lhs.s - rhs.s, lhs.v - rhs.v) } }); + +#[cfg(feature = "use_simd")] +impl_operator_default!( Sub > for Quaternion { + fn sub(lhs, rhs) -> Quaternion { + Quaternion::from_sv(lhs.s - rhs.s, lhs.v - rhs.v) + } +}); + +#[cfg(feature = "use_simd")] +impl_operator_simd!{ + [Simdf32x4]; Sub> for Quaternion { + fn sub(lhs, rhs) -> Quaternion { + (lhs - rhs).into() + } + } +} + +#[cfg(not(feature = "use_simd"))] impl_assignment_operator!( SubAssign > for Quaternion { fn sub_assign(&mut self, other) { self.s -= other.s; self.v -= other.v; } }); +#[cfg(feature = "use_simd")] +impl_assignment_operator_default!( SubAssign > for Quaternion { + fn sub_assign(&mut self, other) { self.s -= other.s; self.v -= other.v; } +}); + +#[cfg(feature = "use_simd")] +impl SubAssign for Quaternion { + #[inline] + fn sub_assign(&mut self, rhs: Self) { + let s: Simdf32x4 = (*self).into(); + let rhs: Simdf32x4 = rhs.into(); + *self = (s - rhs).into(); + } +} + +#[cfg(not(feature = "use_simd"))] impl_operator!( Mul > for Quaternion { fn mul(lhs, rhs) -> Quaternion { Quaternion::new(lhs.s * 
rhs.s - lhs.v.x * rhs.v.x - lhs.v.y * rhs.v.y - lhs.v.z * rhs.v.z, @@ -272,6 +471,37 @@ impl_operator!( Mul > for Quaternion { } }); +#[cfg(feature = "use_simd")] +impl_operator_default!( Mul > for Quaternion { + fn mul(lhs, rhs) -> Quaternion { + Quaternion::new(lhs.s * rhs.s - lhs.v.x * rhs.v.x - lhs.v.y * rhs.v.y - lhs.v.z * rhs.v.z, + lhs.s * rhs.v.x + lhs.v.x * rhs.s + lhs.v.y * rhs.v.z - lhs.v.z * rhs.v.y, + lhs.s * rhs.v.y + lhs.v.y * rhs.s + lhs.v.z * rhs.v.x - lhs.v.x * rhs.v.z, + lhs.s * rhs.v.z + lhs.v.z * rhs.s + lhs.v.x * rhs.v.y - lhs.v.y * rhs.v.x) + } +}); + +#[cfg(feature = "use_simd")] +impl_operator_simd!{ + [Simdf32x4]; Mul> for Quaternion { + fn mul(lhs, rhs) -> Quaternion { + { + let p0 = Simdf32x4::splat(lhs.extract(0)) * rhs; + let p1 = Simdf32x4::splat(lhs.extract(1)) * Simdf32x4::new( + -rhs.extract(1), rhs.extract(0), -rhs.extract(3), rhs.extract(2) + ); + let p2 = Simdf32x4::splat(lhs.extract(2)) * Simdf32x4::new( + -rhs.extract(2), rhs.extract(3), rhs.extract(0), -rhs.extract(1) + ); + let p3 = Simdf32x4::splat(lhs.extract(3)) * Simdf32x4::new( + -rhs.extract(3), -rhs.extract(2), rhs.extract(1), rhs.extract(0) + ); + (p0 + p1 + p2 + p3).into() + } + } + } +} + macro_rules! impl_scalar_mul { ($S:ident) => { impl_operator!(Mul> for $S { diff --git a/src/vector.rs b/src/vector.rs index dbab97b..a289614 100644 --- a/src/vector.rs +++ b/src/vector.rs @@ -25,6 +25,13 @@ use angle::Rad; use approx::ApproxEq; use num::{BaseNum, BaseFloat, PartialOrd}; +#[cfg(feature = "use_simd")] +use simd::f32x4 as Simdf32x4; +#[cfg(feature = "use_simd")] +use simd::i32x4 as Simdi32x4; +#[cfg(feature = "use_simd")] +use simd::u32x4 as Simdu32x4; + /// A 1-dimensional vector. /// /// This type is marked as `#[repr(C)]`. @@ -291,6 +298,218 @@ macro_rules! impl_vector { } } +// Utility macro for generating associated functions for the vectors +// mainly duplication +#[cfg(feature = "use_simd")] +macro_rules! impl_vector_default { + ($VectorN:ident { $($field:ident),+ }, $n:expr, $constructor:ident) => { + impl $VectorN { + /// Construct a new vector, using the provided values. + #[inline] + pub fn new($($field: S),+) -> $VectorN { + $VectorN { $($field: $field),+ } + } + } + + /// The short constructor. 
+ #[inline] + pub fn $constructor($($field: S),+) -> $VectorN { + $VectorN::new($($field),+) + } + + impl $VectorN { + /// Component-wise casting to another type + #[inline] + pub fn cast(&self) -> $VectorN { + $VectorN { $($field: NumCast::from(self.$field).unwrap()),+ } + } + } + + impl MetricSpace for $VectorN { + type Metric = S; + + #[inline] + fn distance2(self, other: Self) -> S { + (other - self).magnitude2() + } + } + + impl Array for $VectorN { + type Element = S; + + #[inline] + fn from_value(scalar: S) -> $VectorN { + $VectorN { $($field: scalar),+ } + } + + #[inline] + fn sum(self) -> S where S: Add { + fold_array!(add, { $(self.$field),+ }) + } + + #[inline] + fn product(self) -> S where S: Mul { + fold_array!(mul, { $(self.$field),+ }) + } + + #[inline] + fn min(self) -> S where S: PartialOrd { + fold_array!(partial_min, { $(self.$field),+ }) + } + + #[inline] + fn max(self) -> S where S: PartialOrd { + fold_array!(partial_max, { $(self.$field),+ }) + } + } + + impl Zero for $VectorN { + #[inline] + fn zero() -> $VectorN { + $VectorN::from_value(S::zero()) + } + + #[inline] + fn is_zero(&self) -> bool { + *self == $VectorN::zero() + } + } + + impl VectorSpace for $VectorN { + type Scalar = S; + } + + impl> Neg for $VectorN { + type Output = $VectorN; + + #[inline] + default fn neg(self) -> $VectorN { $VectorN::new($(-self.$field),+) } + } + + impl ApproxEq for $VectorN { + type Epsilon = S::Epsilon; + + #[inline] + fn default_epsilon() -> S::Epsilon { + S::default_epsilon() + } + + #[inline] + fn default_max_relative() -> S::Epsilon { + S::default_max_relative() + } + + #[inline] + fn default_max_ulps() -> u32 { + S::default_max_ulps() + } + + #[inline] + fn relative_eq(&self, other: &Self, epsilon: S::Epsilon, max_relative: S::Epsilon) -> bool { + $(S::relative_eq(&self.$field, &other.$field, epsilon, max_relative))&&+ + } + + #[inline] + fn ulps_eq(&self, other: &Self, epsilon: S::Epsilon, max_ulps: u32) -> bool { + $(S::ulps_eq(&self.$field, &other.$field, epsilon, max_ulps))&&+ + } + } + + impl Rand for $VectorN { + #[inline] + fn rand(rng: &mut R) -> $VectorN { + $VectorN { $($field: rng.gen()),+ } + } + } + + impl_operator_default!( Add<$VectorN > for $VectorN { + fn add(lhs, rhs) -> $VectorN { $VectorN::new($(lhs.$field + rhs.$field),+) } + }); + + impl_assignment_operator_default!( AddAssign<$VectorN > for $VectorN { + fn add_assign(&mut self, other) { $(self.$field += other.$field);+ } + }); + + impl_operator_default!( Sub<$VectorN > for $VectorN { + fn sub(lhs, rhs) -> $VectorN { $VectorN::new($(lhs.$field - rhs.$field),+) } + }); + + impl_assignment_operator_default!( SubAssign<$VectorN > for $VectorN { + fn sub_assign(&mut self, other) { $(self.$field -= other.$field);+ } + }); + + impl_operator_default!( Mul for $VectorN { + fn mul(vector, scalar) -> $VectorN { $VectorN::new($(vector.$field * scalar),+) } + }); + + impl_assignment_operator_default!( MulAssign for $VectorN { + fn mul_assign(&mut self, scalar) { $(self.$field *= scalar);+ } + }); + + impl_operator_default!( Div for $VectorN { + fn div(vector, scalar) -> $VectorN { $VectorN::new($(vector.$field / scalar),+) } + }); + + impl_assignment_operator_default!( DivAssign for $VectorN { + fn div_assign(&mut self, scalar) { $(self.$field /= scalar);+ } + }); + + impl_operator!( Rem for $VectorN { + fn rem(vector, scalar) -> $VectorN { $VectorN::new($(vector.$field % scalar),+) } + }); + impl_assignment_operator!( RemAssign for $VectorN { + fn rem_assign(&mut self, scalar) { $(self.$field %= scalar);+ } + }); 
+ + impl ElementWise for $VectorN { + #[inline] default fn add_element_wise(self, rhs: $VectorN) -> $VectorN { $VectorN::new($(self.$field + rhs.$field),+) } + #[inline] default fn sub_element_wise(self, rhs: $VectorN) -> $VectorN { $VectorN::new($(self.$field - rhs.$field),+) } + #[inline] default fn mul_element_wise(self, rhs: $VectorN) -> $VectorN { $VectorN::new($(self.$field * rhs.$field),+) } + #[inline] default fn div_element_wise(self, rhs: $VectorN) -> $VectorN { $VectorN::new($(self.$field / rhs.$field),+) } + #[inline] fn rem_element_wise(self, rhs: $VectorN) -> $VectorN { $VectorN::new($(self.$field % rhs.$field),+) } + + #[inline] default fn add_assign_element_wise(&mut self, rhs: $VectorN) { $(self.$field += rhs.$field);+ } + #[inline] default fn sub_assign_element_wise(&mut self, rhs: $VectorN) { $(self.$field -= rhs.$field);+ } + #[inline] default fn mul_assign_element_wise(&mut self, rhs: $VectorN) { $(self.$field *= rhs.$field);+ } + #[inline] default fn div_assign_element_wise(&mut self, rhs: $VectorN) { $(self.$field /= rhs.$field);+ } + #[inline] fn rem_assign_element_wise(&mut self, rhs: $VectorN) { $(self.$field %= rhs.$field);+ } + } + + impl ElementWise for $VectorN { + #[inline] default fn add_element_wise(self, rhs: S) -> $VectorN { $VectorN::new($(self.$field + rhs),+) } + #[inline] default fn sub_element_wise(self, rhs: S) -> $VectorN { $VectorN::new($(self.$field - rhs),+) } + #[inline] default fn mul_element_wise(self, rhs: S) -> $VectorN { $VectorN::new($(self.$field * rhs),+) } + #[inline] default fn div_element_wise(self, rhs: S) -> $VectorN { $VectorN::new($(self.$field / rhs),+) } + #[inline] fn rem_element_wise(self, rhs: S) -> $VectorN { $VectorN::new($(self.$field % rhs),+) } + + #[inline] default fn add_assign_element_wise(&mut self, rhs: S) { $(self.$field += rhs);+ } + #[inline] default fn sub_assign_element_wise(&mut self, rhs: S) { $(self.$field -= rhs);+ } + #[inline] default fn mul_assign_element_wise(&mut self, rhs: S) { $(self.$field *= rhs);+ } + #[inline] default fn div_assign_element_wise(&mut self, rhs: S) { $(self.$field /= rhs);+ } + #[inline] fn rem_assign_element_wise(&mut self, rhs: S) { $(self.$field %= rhs);+ } + } + + impl_scalar_ops!($VectorN { $($field),+ }); + impl_scalar_ops!($VectorN { $($field),+ }); + impl_scalar_ops!($VectorN { $($field),+ }); + impl_scalar_ops_default!($VectorN { $($field),+ }); + impl_scalar_ops!($VectorN { $($field),+ }); + impl_scalar_ops!($VectorN { $($field),+ }); + impl_scalar_ops!($VectorN { $($field),+ }); + impl_scalar_ops!($VectorN { $($field),+ }); + impl_scalar_ops_default!($VectorN { $($field),+ }); + impl_scalar_ops!($VectorN { $($field),+ }); + impl_scalar_ops_default!($VectorN { $($field),+ }); + impl_scalar_ops!($VectorN { $($field),+ }); + + impl_index_operators!($VectorN, $n, S, usize); + impl_index_operators!($VectorN, $n, [S], Range); + impl_index_operators!($VectorN, $n, [S], RangeTo); + impl_index_operators!($VectorN, $n, [S], RangeFrom); + impl_index_operators!($VectorN, $n, [S], RangeFull); + } +} + macro_rules! impl_scalar_ops { ($VectorN:ident<$S:ident> { $($field:ident),+ }) => { impl_operator!(Mul<$VectorN<$S>> for $S { @@ -305,10 +524,28 @@ macro_rules! impl_scalar_ops { }; } +#[cfg(feature = "use_simd")] +macro_rules! 
impl_scalar_ops_default {
+    ($VectorN:ident<$S:ident> { $($field:ident),+ }) => {
+        impl_operator_default!(Mul<$VectorN<$S>> for $S {
+            fn mul(scalar, vector) -> $VectorN<$S> { $VectorN::new($(scalar * vector.$field),+) }
+        });
+        impl_operator_default!(Div<$VectorN<$S>> for $S {
+            fn div(scalar, vector) -> $VectorN<$S> { $VectorN::new($(scalar / vector.$field),+) }
+        });
+        impl_operator_default!(Rem<$VectorN<$S>> for $S {
+            fn rem(scalar, vector) -> $VectorN<$S> { $VectorN::new($(scalar % vector.$field),+) }
+        });
+    };
+}
+
 impl_vector!(Vector1 { x }, 1, vec1);
 impl_vector!(Vector2 { x, y }, 2, vec2);
 impl_vector!(Vector3 { x, y, z }, 3, vec3);
+#[cfg(not(feature = "use_simd"))]
 impl_vector!(Vector4 { x, y, z, w }, 4, vec4);
+#[cfg(feature = "use_simd")]
+impl_vector_default!(Vector4 { x, y, z, w }, 4, vec4);

 impl_fixed_array_conversions!(Vector1 { x: 0 }, 1);
 impl_fixed_array_conversions!(Vector2 { x: 0, y: 1 }, 2);
@@ -350,7 +587,7 @@ impl<S: BaseNum> Vector2<S> {
     /// Create a `Vector3`, using the `x` and `y` values from this vector, and the
     /// provided `z`.
     #[inline]
-    pub fn extend(self, z: S)-> Vector3<S> {
+    pub fn extend(self, z: S) -> Vector3<S> {
         Vector3::new(self.x, self.y, z)
     }
 }
@@ -386,13 +623,13 @@ impl<S: BaseNum> Vector3<S> {
     /// Create a `Vector4`, using the `x`, `y` and `z` values from this vector, and the
     /// provided `w`.
     #[inline]
-    pub fn extend(self, w: S)-> Vector4<S> {
+    pub fn extend(self, w: S) -> Vector4<S> {
         Vector4::new(self.x, self.y, self.z, w)
     }

     /// Create a `Vector2`, dropping the `z` value.
     #[inline]
-    pub fn truncate(self)-> Vector2<S> {
+    pub fn truncate(self) -> Vector2<S> {
         Vector2::new(self.x, self.y)
     }
 }
@@ -424,27 +661,27 @@ impl<S: BaseNum> Vector4<S> {
     /// Create a `Vector3`, dropping the `w` value.
     #[inline]
-    pub fn truncate(self)-> Vector3<S> {
+    pub fn truncate(self) -> Vector3<S> {
         Vector3::new(self.x, self.y, self.z)
     }

     /// Create a `Vector3`, dropping the nth element
     #[inline]
-    pub fn truncate_n(&self, n: isize)-> Vector3<S> {
+    pub fn truncate_n(&self, n: isize) -> Vector3<S> {
         match n {
             0 => Vector3::new(self.y, self.z, self.w),
             1 => Vector3::new(self.x, self.z, self.w),
             2 => Vector3::new(self.x, self.y, self.w),
             3 => Vector3::new(self.x, self.y, self.z),
-            _ => panic!("{:?} is out of range", n)
+            _ => panic!("{:?} is out of range", n),
         }
     }
 }

 /// Dot product of two vectors.
 #[inline]
-pub fn dot<V: InnerSpace>(a: V, b: V) -> V::Scalar where
-    V::Scalar: BaseFloat,
+pub fn dot<V: InnerSpace>(a: V, b: V) -> V::Scalar
+    where V::Scalar: BaseFloat
 {
     V::dot(a, b)
 }
@@ -515,6 +752,369 @@ impl<S: fmt::Debug> fmt::Debug for Vector4<S> {
     }
 }

+#[cfg(feature = "use_simd")]
+impl From<Simdf32x4> for Vector4<f32> {
+    #[inline]
+    fn from(f: Simdf32x4) -> Self {
+        unsafe {
+            let mut ret: Self = mem::uninitialized();
+            {
+                let ret_mut: &mut [f32; 4] = ret.as_mut();
+                f.store(ret_mut.as_mut(), 0 as usize);
+            }
+            ret
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl Vector4<f32> {
+    /// Compute and return the square root of each element.
+    #[inline]
+    pub fn sqrt_element_wide(self) -> Self {
+        let s: Simdf32x4 = self.into();
+        s.sqrt().into()
+    }
+
+    /// Compute and return the reciprocal of the square root of each element.
+    #[inline]
+    pub fn rsqrt_element_wide(self) -> Self {
+        let s: Simdf32x4 = self.into();
+        s.approx_rsqrt().into()
+    }
+
+    /// Compute and return the reciprocal of each element.
+    #[inline]
+    pub fn recip_element_wide(self) -> Self {
+        let s: Simdf32x4 = self.into();
+        s.approx_reciprocal().into()
+    }
+}
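// The three *_element_wide helpers above map to the simd crate's sqrt,
// approx_rsqrt and approx_reciprocal, so the last two are fast approximations
// (roughly 12 bits of precision on x86) rather than exactly rounded results.
// A usage sketch, assuming a nightly toolchain and --features "use_simd":

extern crate cgmath;
use cgmath::Vector4;

fn main() {
    let v = Vector4::new(1.0f32, 4.0, 9.0, 16.0);
    let roots = v.sqrt_element_wide();      // (1, 2, 3, 4), exactly rounded
    let rsqrt = v.rsqrt_element_wide();     // ~(1, 1/2, 1/3, 1/4), approximate
    let recip = roots.recip_element_wide(); // ~(1, 1/2, 1/3, 1/4), approximate
    assert_eq!(roots, Vector4::new(1.0, 2.0, 3.0, 4.0));
    assert!((rsqrt.w - 0.25).abs() < 0.005);
    assert!((recip.z - 1.0 / 3.0).abs() < 0.005);
}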
+
+#[cfg(feature = "use_simd")]
+impl Into<Simdf32x4> for Vector4<f32> {
+    #[inline]
+    fn into(self) -> Simdf32x4 {
+        let self_ref: &[f32; 4] = self.as_ref();
+        Simdf32x4::load(self_ref.as_ref(), 0 as usize)
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{
+    [Simdf32x4]; Add<Vector4<f32>> for Vector4<f32> {
+        fn add(lhs, rhs) -> Vector4<f32> {
+            (lhs + rhs).into()
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{
+    [Simdf32x4]; Sub<Vector4<f32>> for Vector4<f32> {
+        fn sub(lhs, rhs) -> Vector4<f32> {
+            (lhs - rhs).into()
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{@rs
+    [Simdf32x4]; Mul<f32> for Vector4<f32> {
+        fn mul(lhs, rhs) -> Vector4<f32> {
+            (lhs * rhs).into()
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{@rs
+    [Simdf32x4]; Div<f32> for Vector4<f32> {
+        fn div(lhs, rhs) -> Vector4<f32> {
+            (lhs / rhs).into()
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{
+    [Simdf32x4]; Neg for Vector4<f32> {
+        fn neg(lhs) -> Vector4<f32> {
+            (-lhs).into()
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl AddAssign for Vector4<f32> {
+    #[inline]
+    fn add_assign(&mut self, rhs: Self) {
+        let s: Simdf32x4 = (*self).into();
+        let rhs: Simdf32x4 = rhs.into();
+        *self = (s + rhs).into();
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl SubAssign for Vector4<f32> {
+    #[inline]
+    fn sub_assign(&mut self, rhs: Self) {
+        let s: Simdf32x4 = (*self).into();
+        let rhs: Simdf32x4 = rhs.into();
+        *self = (s - rhs).into();
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl MulAssign<f32> for Vector4<f32> {
+    fn mul_assign(&mut self, other: f32) {
+        let s: Simdf32x4 = (*self).into();
+        let other = Simdf32x4::splat(other);
+        *self = (s * other).into();
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl DivAssign<f32> for Vector4<f32> {
+    fn div_assign(&mut self, other: f32) {
+        let s: Simdf32x4 = (*self).into();
+        let other = Simdf32x4::splat(other);
+        *self = (s / other).into();
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl ElementWise for Vector4<f32> {
+    #[inline] fn add_element_wise(self, rhs: Vector4<f32>) -> Vector4<f32> { self + rhs }
+    #[inline] fn sub_element_wise(self, rhs: Vector4<f32>) -> Vector4<f32> { self - rhs }
+    #[inline] fn mul_element_wise(self, rhs: Vector4<f32>) -> Vector4<f32> {
+        let s: Simdf32x4 = self.into();
+        let rhs: Simdf32x4 = rhs.into();
+        (s * rhs).into()
+    }
+    #[inline] fn div_element_wise(self, rhs: Vector4<f32>) -> Vector4<f32> {
+        let s: Simdf32x4 = self.into();
+        let rhs: Simdf32x4 = rhs.into();
+        (s / rhs).into()
+    }
+
+    #[inline] fn add_assign_element_wise(&mut self, rhs: Vector4<f32>) { (*self) += rhs; }
+    #[inline] fn sub_assign_element_wise(&mut self, rhs: Vector4<f32>) { (*self) -= rhs; }
+    #[inline] fn mul_assign_element_wise(&mut self, rhs: Vector4<f32>) {
+        let s: Simdf32x4 = (*self).into();
+        let rhs: Simdf32x4 = rhs.into();
+        *self = (s * rhs).into();
+    }
+    #[inline] fn div_assign_element_wise(&mut self, rhs: Vector4<f32>) {
+        let s: Simdf32x4 = (*self).into();
+        let rhs: Simdf32x4 = rhs.into();
+        *self = (s / rhs).into();
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl ElementWise<f32> for Vector4<f32> {
+    #[inline] fn add_element_wise(self, rhs: f32) -> Vector4<f32> {
+        let s: Simdf32x4 = self.into();
+        let rhs = Simdf32x4::splat(rhs);
+        (s + rhs).into()
+    }
+    #[inline] fn sub_element_wise(self, rhs: f32) -> Vector4<f32> {
+        let s: Simdf32x4 = self.into();
+        let rhs = Simdf32x4::splat(rhs);
+        (s - rhs).into()
+    }
+    #[inline] fn mul_element_wise(self, rhs: f32) -> Vector4<f32> { self * rhs }
+    #[inline] fn div_element_wise(self, rhs: f32) -> Vector4<f32> { self / rhs }
+    #[inline] fn add_assign_element_wise(&mut self, rhs: f32) {
+        let s: Simdf32x4 = (*self).into();
+        let rhs = Simdf32x4::splat(rhs);
+        *self = (s + rhs).into();
+    }
+    #[inline] fn sub_assign_element_wise(&mut self, rhs: f32) {
+        let s: Simdf32x4 = (*self).into();
+        let rhs = Simdf32x4::splat(rhs);
+        *self = (s - rhs).into();
+    }
+    #[inline] fn mul_assign_element_wise(&mut self, rhs: f32) { (*self) *= rhs; }
+    #[inline] fn div_assign_element_wise(&mut self, rhs: f32) { (*self) /= rhs; }
+}
+
+#[cfg(feature = "use_simd")]
+impl From<Simdi32x4> for Vector4<i32> {
+    #[inline]
+    fn from(f: Simdi32x4) -> Self {
+        unsafe {
+            let mut ret: Self = mem::uninitialized();
+            {
+                let ret_mut: &mut [i32; 4] = ret.as_mut();
+                f.store(ret_mut.as_mut(), 0 as usize);
+            }
+            ret
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl Into<Simdi32x4> for Vector4<i32> {
+    #[inline]
+    fn into(self) -> Simdi32x4 {
+        let self_ref: &[i32; 4] = self.as_ref();
+        Simdi32x4::load(self_ref.as_ref(), 0 as usize)
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{
+    [Simdi32x4]; Add<Vector4<i32>> for Vector4<i32> {
+        fn add(lhs, rhs) -> Vector4<i32> {
+            (lhs + rhs).into()
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{
+    [Simdi32x4]; Sub<Vector4<i32>> for Vector4<i32> {
+        fn sub(lhs, rhs) -> Vector4<i32> {
+            (lhs - rhs).into()
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{@rs
+    [Simdi32x4]; Mul<i32> for Vector4<i32> {
+        fn mul(lhs, rhs) -> Vector4<i32> {
+            (lhs * rhs).into()
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{
+    [Simdi32x4]; Neg for Vector4<i32> {
+        fn neg(lhs) -> Vector4<i32> {
+            (-lhs).into()
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl AddAssign for Vector4<i32> {
+    #[inline]
+    fn add_assign(&mut self, rhs: Self) {
+        let s: Simdi32x4 = (*self).into();
+        let rhs: Simdi32x4 = rhs.into();
+        *self = (s + rhs).into();
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl SubAssign for Vector4<i32> {
+    #[inline]
+    fn sub_assign(&mut self, rhs: Self) {
+        let s: Simdi32x4 = (*self).into();
+        let rhs: Simdi32x4 = rhs.into();
+        *self = (s - rhs).into();
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl MulAssign<i32> for Vector4<i32> {
+    fn mul_assign(&mut self, other: i32) {
+        let s: Simdi32x4 = (*self).into();
+        let other = Simdi32x4::splat(other);
+        *self = (s * other).into();
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl From<Simdu32x4> for Vector4<u32> {
+    #[inline]
+    fn from(f: Simdu32x4) -> Self {
+        unsafe {
+            let mut ret: Self = mem::uninitialized();
+            {
+                let ret_mut: &mut [u32; 4] = ret.as_mut();
+                f.store(ret_mut.as_mut(), 0 as usize);
+            }
+            ret
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl Into<Simdu32x4> for Vector4<u32> {
+    #[inline]
+    fn into(self) -> Simdu32x4 {
+        let self_ref: &[u32; 4] = self.as_ref();
+        Simdu32x4::load(self_ref.as_ref(), 0 as usize)
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{
+    [Simdu32x4]; Add<Vector4<u32>> for Vector4<u32> {
+        fn add(lhs, rhs) -> Vector4<u32> {
+            (lhs + rhs).into()
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{
+    [Simdu32x4]; Sub<Vector4<u32>> for Vector4<u32> {
+        fn sub(lhs, rhs) -> Vector4<u32> {
+            (lhs - rhs).into()
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl_operator_simd!{@rs
+    [Simdu32x4]; Mul<u32> for Vector4<u32> {
+        fn mul(lhs, rhs) -> Vector4<u32> {
+            (lhs * rhs).into()
+        }
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl AddAssign for Vector4<u32> {
+    #[inline]
+    fn add_assign(&mut self, rhs: Self) {
+        let s: Simdu32x4 = (*self).into();
+        let rhs: Simdu32x4 = rhs.into();
+        *self = (s + rhs).into();
+    }
+}
+
+#[cfg(feature = "use_simd")]
+impl SubAssign for Vector4<u32> {
+    #[inline]
+    fn sub_assign(&mut self, rhs: Self) {
+        let s: Simdu32x4 = (*self).into();
+        let rhs: Simdu32x4 = rhs.into();
+        *self = (s - rhs).into();
+    }
+}
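// Note that the integer paths above stop at add, sub, scalar mul and (for i32)
// neg: SSE offers no packed integer division, so Div and the remaining ops fall
// back to the scalar `default fn` implementations from impl_vector_default!.
// A usage sketch, assuming nightly and --features "use_simd":

extern crate cgmath;
use cgmath::Vector4;

fn main() {
    let a = Vector4::new(1i32, 2, 3, 4);
    let b = Vector4::new(10i32, 20, 30, 40);
    assert_eq!(a + b, Vector4::new(11, 22, 33, 44)); // i32x4 path
    assert_eq!(b - a, Vector4::new(9, 18, 27, 36));  // i32x4 path
    assert_eq!(a * 3, Vector4::new(3, 6, 9, 12));    // i32x4 path
    assert_eq!(-a, Vector4::new(-1, -2, -3, -4));    // i32x4 path
    assert_eq!(b / 10, a);                           // scalar fallback
}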
+
+#[cfg(feature = "use_simd")]
+impl MulAssign<u32> for Vector4<u32> {
+    fn mul_assign(&mut self, other: u32) {
+        let s: Simdu32x4 = (*self).into();
+        let other = Simdu32x4::splat(other);
+        *self = (s * other).into();
+    }
+}

 #[cfg(test)]
 mod tests {
     mod vector2 {
@@ -729,7 +1329,12 @@ mod vector4 {
         use vector::*;

-        const VECTOR4: Vector4<i32> = Vector4 { x: 1, y: 2, z: 3, w: 4 };
+        const VECTOR4: Vector4<i32> = Vector4 {
+            x: 1,
+            y: 2,
+            z: 3,
+            w: 4,
+        };

         #[test]
         fn test_index() {
@@ -796,11 +1401,11 @@ fn test_as_mut() {
            let mut v = VECTOR4;
            {
-               let v: &mut[i32; 4] = v.as_mut();
+               let v: &mut [i32; 4] = v.as_mut();
                assert_eq!(v, &mut [1, 2, 3, 4]);
            }
            {
-               let v: &mut(i32, i32, i32, i32) = v.as_mut();
+               let v: &mut (i32, i32, i32, i32) = v.as_mut();
                assert_eq!(v, &mut (1, 2, 3, 4));
            }
        }
diff --git a/tests/quaternion.rs b/tests/quaternion.rs
index f59911d..5b00619 100644
--- a/tests/quaternion.rs
+++ b/tests/quaternion.rs
@@ -194,13 +194,13 @@ mod rotate_from_euler {
         let vec = vec3(0.0, 1.0, 0.0);

         let rot = Quaternion::from(Euler::new(Deg(90.0), Deg(90.0), Deg(0.0)));
-        assert_ulps_eq!(vec3(0.0, 0.0, 1.0), rot * vec);
+        assert_ulps_eq!(vec3(0.0f32, 0.0f32, 1.0f32), rot * vec);
     }

     // tests that the Z rotation is done after the Y
     #[test]
     fn test_y_then_z() {
-        let vec = vec3(0.0, 0.0, 1.0);
+        let vec = vec3(0.0f32, 0.0f32, 1.0f32);

         let rot = Quaternion::from(Euler::new(Deg(0.0), Deg(90.0), Deg(90.0)));
         assert_ulps_eq!(vec3(1.0, 0.0, 0.0), rot * vec);
diff --git a/tests/vector4f32.rs b/tests/vector4f32.rs
new file mode 100644
index 0000000..b860d76
--- /dev/null
+++ b/tests/vector4f32.rs
@@ -0,0 +1,191 @@
+// Copyright 2013-2014 The CGMath Developers. For a full listing of the authors,
+// refer to the Cargo.toml file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#[macro_use]
+extern crate approx;
+#[macro_use]
+extern crate cgmath;
+
+use cgmath::*;
+use std::f32;
+
+#[test]
+fn test_constructor() {
+    assert_eq!(vec4(1f32, 2f32, 3f32, 4f32), Vector4::new(1f32, 2f32, 3f32, 4f32));
+}
+
+#[test]
+fn test_from_value() {
+    assert_eq!(Vector4::from_value(76.5f32), Vector4::new(76.5f32, 76.5f32, 76.5f32, 76.5f32));
+}
+
+macro_rules! impl_test_add {
+    ($VectorN:ident { $($field:ident),+ }, $s:expr, $v:expr) => (
+        // vector + vector ops
+        assert_eq!($v + $v, $VectorN::new($($v.$field + $v.$field),+));
+        assert_eq!(&$v + &$v, $v + $v);
+        assert_eq!(&$v + $v, $v + $v);
+        assert_eq!($v + &$v, $v + $v);
+    )
+}
+
+macro_rules! impl_test_sub {
+    ($VectorN:ident { $($field:ident),+ }, $s:expr, $v:expr) => (
+        // vector - vector ops
+        assert_eq!($v - $v, $VectorN::new($($v.$field - $v.$field),+));
+        assert_eq!(&$v - &$v, $v - $v);
+        assert_eq!(&$v - $v, $v - $v);
+        assert_eq!($v - &$v, $v - $v);
+    )
+}
+macro_rules! impl_test_mul {
+    ($VectorN:ident { $($field:ident),+ }, $s:expr, $v:expr) => (
+        // vector * scalar ops
+        assert_eq!($v * $s, $VectorN::new($($v.$field * $s),+));
+        assert_eq!($s * $v, $VectorN::new($($s * $v.$field),+));
+        assert_eq!(&$v * $s, $v * $s);
+        assert_eq!($s * &$v, $s * $v);
+        // commutativity
+        assert_eq!($v * $s, $s * $v);
+    )
+}
+
+macro_rules! impl_test_div {
+    ($VectorN:ident { $($field:ident),+ }, $s:expr, $v:expr) => (
+        // vector / scalar ops
+        assert_eq!($v / $s, $VectorN::new($($v.$field / $s),+));
+        assert_eq!($s / $v, $VectorN::new($($s / $v.$field),+));
+        assert_eq!(&$v / $s, $v / $s);
+        assert_eq!($s / &$v, $s / $v);
+    )
+}
+
+macro_rules! impl_test_rem {
+    ($VectorN:ident { $($field:ident),+ }, $s:expr, $v:expr) => (
+        // vector % scalar ops
+        assert_eq!($v % $s, $VectorN::new($($v.$field % $s),+));
+        assert_eq!($s % $v, $VectorN::new($($s % $v.$field),+));
+        assert_eq!(&$v % $s, $v % $s);
+        assert_eq!($s % &$v, $s % $v);
+    )
+}
+
+#[test]
+fn test_add() {
+    impl_test_add!(Vector4 { x, y, z, w }, 2.0f32, vec4(2.0f32, 4.0f32, 6.0f32, 8.0f32));
+}
+
+#[test]
+fn test_sub() {
+    impl_test_sub!(Vector4 { x, y, z, w }, 2.0f32, vec4(2.0f32, 4.0f32, 6.0f32, 8.0f32));
+}
+
+#[test]
+fn test_mul() {
+    impl_test_mul!(Vector4 { x, y, z, w }, 2.0f32, vec4(2.0f32, 4.0f32, 6.0f32, 8.0f32));
+}
+
+#[test]
+fn test_div() {
+    impl_test_div!(Vector4 { x, y, z, w }, 2.0f32, vec4(2.0f32, 4.0f32, 6.0f32, 8.0f32));
+}
+
+#[test]
+fn test_rem() {
+    impl_test_rem!(Vector4 { x, y, z, w }, 2.0f32, vec4(2.0f32, 4.0f32, 6.0f32, 8.0f32));
+}
+
+#[test]
+fn test_dot() {
+    assert_eq!(Vector4::new(1.0f32, 2.0f32, 3.0f32, 4.0f32).dot(Vector4::new(5.0f32, 6.0f32, 7.0f32, 8.0f32)), 70.0f32);
+}
+
+#[test]
+fn test_sum() {
+    assert_eq!(Vector4::new(1f32, 2f32, 3f32, 4f32).sum(), 10f32);
+    assert_eq!(Vector4::new(5.0f32, 6.0f32, 7.0f32, 8.0f32).sum(), 26.0f32);
+}
+
+#[test]
+fn test_product() {
+    assert_eq!(Vector4::new(1f32, 2f32, 3f32, 4f32).product(), 24f32);
+    assert_eq!(Vector4::new(5.0f32, 6.0f32, 7.0f32, 8.0f32).product(), 1680.0f32);
+}
+
+#[test]
+fn test_min() {
+    assert_eq!(Vector4::new(1f32, 2f32, 3f32, 4f32).min(), 1f32);
+    assert_eq!(Vector4::new(5.0f32, 6.0f32, 7.0f32, 8.0f32).min(), 5.0f32);
+}
+
+#[test]
+fn test_max() {
+    assert_eq!(Vector4::new(1f32, 2f32, 3f32, 4f32).max(), 4f32);
+    assert_eq!(Vector4::new(5.0f32, 6.0f32, 7.0f32, 8.0f32).max(), 8.0f32);
+}
+
+#[test]
+fn test_is_perpendicular() {
+    assert!(Vector4::new(1.0f32, 0.0f32, 0.0f32, 0.0f32).is_perpendicular(Vector4::new(0.0f32, 0.0f32, 0.0f32, 1.0f32)));
+}
+
+#[cfg(test)]
+mod test_magnitude {
+    use cgmath::*;
+
+    #[test]
+    fn test_vector4() {
+        let (a, a_res) = (Vector4::new(1.0f32, 2.0f32, 4.0f32, 10.0f32), 11.0f32); // (1, 2, 4, 10, 11) Pythagorean quintuple
+        let (b, b_res) = (Vector4::new(1.0f32, 2.0f32, 8.0f32, 10.0f32), 13.0f32); // (1, 2, 8, 10, 13) Pythagorean quintuple
+
+        assert_eq!(a.magnitude2(), a_res * a_res);
+        assert_eq!(b.magnitude2(), b_res * b_res);
+
+        assert_eq!(a.magnitude(), a_res);
+        assert_eq!(b.magnitude(), b_res);
+
+        #[cfg(feature = "use_simd")]
+        {
+            let a = Vector4::new(1f32, 4f32, 9f32, 16f32);
+            assert_ulps_eq!(a.sqrt_element_wide(), Vector4::new(1f32, 2f32, 3f32, 4f32));
+            assert_relative_eq!(a.sqrt_element_wide().recip_element_wide(), Vector4::new(1f32, 1f32/2f32, 1f32/3f32, 1f32/4f32), max_relative = 0.005f32);
+            assert_relative_eq!(a.rsqrt_element_wide(), Vector4::new(1f32, 1f32/2f32, 1f32/3f32, 1f32/4f32), max_relative = 0.005f32);
+        }
+    }
+}
+
+#[test]
+fn test_angle() {
+    assert_ulps_eq!(Vector4::new(1.0f32, 0.0f32, 1.0f32, 0.0f32).angle(Vector4::new(0.0f32, 1.0f32, 0.0f32, 1.0f32)), &Rad(f32::consts::FRAC_PI_2));
+    assert_ulps_eq!(Vector4::new(10.0f32, 0.0f32, 10.0f32, 0.0f32).angle(Vector4::new(0.0f32, 5.0f32, 0.0f32, 5.0f32)), &Rad(f32::consts::FRAC_PI_2));
+    assert_ulps_eq!(Vector4::new(-1.0f32, 0.0f32, -1.0f32, 0.0f32).angle(Vector4::new(0.0f32, 1.0f32, 0.0f32, 1.0f32)), &Rad(f32::consts::FRAC_PI_2));
+}
+
+#[test]
+fn test_normalize() {
+    // TODO: test normalize_to, normalize_self, and normalize_self_to
+    assert_ulps_eq!(Vector4::new(1.0f32, 2.0f32, 4.0f32, 10.0f32).normalize(), &Vector4::new(1.0f32/11.0f32, 2.0f32/11.0f32, 4.0f32/11.0f32, 10.0f32/11.0f32));
+}
+
+#[test]
+fn test_cast() {
+    assert_ulps_eq!(Vector4::new(13.5f32, -4.6, -8.3, 2.41).cast(), Vector4::new(13.5f32, -4.6, -8.3, 2.41));
+}
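// Finally, an end-to-end sketch of consuming the feature: built on nightly with
// `cargo test --features "use_simd"`, the operator calls below resolve to the
// f32x4 specializations, while a default build keeps the scalar versions; the
// results (e.g. the dot product of 70.0 from test_dot above) are identical.

extern crate cgmath;
use cgmath::{ElementWise, InnerSpace, Vector4};

fn main() {
    let a = Vector4::new(1.0f32, 2.0, 3.0, 4.0);
    let b = Vector4::new(5.0f32, 6.0, 7.0, 8.0);
    assert_eq!(a + b, Vector4::new(6.0, 8.0, 10.0, 12.0));
    assert_eq!(a.dot(b), 70.0);
    assert_eq!(a.mul_element_wise(b), Vector4::new(5.0, 12.0, 21.0, 32.0));
    assert_eq!(a * 2.0, Vector4::new(2.0, 4.0, 6.0, 8.0));
}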