diff --git a/.travis.yml b/.travis.yml
index 38d6b8d..9768b66 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -6,4 +6,6 @@ rust:
   - stable
 script:
-  - cargo build && cargo test && cargo bench
+  - cargo build && cargo test
+  - if [[ "$TRAVIS_RUST_VERSION" == "nightly" ]]; then cargo bench; fi
+  - if [[ "$TRAVIS_RUST_VERSION" == "nightly" ]]; then cargo build --features "use_simd" && cargo test --features "use_simd" && cargo bench --features "use_simd"; fi
 
diff --git a/benches/mat.rs b/benches/mat.rs
index f472be2..9d94a72 100644
--- a/benches/mat.rs
+++ b/benches/mat.rs
@@ -59,3 +59,5 @@ bench_unop!(_bench_matrix4_invert, Matrix4<f32>, invert);
 bench_unop!(_bench_matrix2_transpose, Matrix2<f32>, transpose);
 bench_unop!(_bench_matrix3_transpose, Matrix3<f32>, transpose);
 bench_unop!(_bench_matrix4_transpose, Matrix4<f32>, transpose);
+
+bench_unop!(_bench_matrix4_determinant, Matrix4<f32>, determinant);
\ No newline at end of file
diff --git a/src/macros.rs b/src/macros.rs
index ac741e8..e1fa8c6 100644
--- a/src/macros.rs
+++ b/src/macros.rs
@@ -381,21 +381,11 @@ macro_rules! impl_operator_simd {
                 let $x: $Simd = self.into(); $body
             }
         }
-
-        // #[cfg(feature = "simd")]
-        // impl<'a> $Op for &'a $Lhs {
-        //     type Output = $Output;
-        //     #[inline]
-        //     fn $op(self) -> $Output {
-        //         let $x: $Simd = (*self).into(); $body
-        //     }
-        // }
     };
     // When the right operand is a scalar
     (@rs [$Simd:ident]; $Op:ident<$Rhs:ty> for $Lhs:ty {
         fn $op:ident($lhs:ident, $rhs:ident) -> $Output:ty { $body:expr }
     }) => {
-
         impl $Op<$Rhs> for $Lhs {
             #[inline]
             fn $op(self, other: $Rhs) -> $Output {
@@ -411,6 +401,7 @@ macro_rules! impl_operator_simd {
             }
         }
     };
+
     // When the right operand is a compound type
     ([$Simd:ident]; $Op:ident<$Rhs:ty> for $Lhs:ty {
         fn $op:ident($lhs:ident, $rhs:ident) -> $Output:ty { $body:expr }
@@ -430,7 +421,6 @@ macro_rules! impl_operator_simd {
                 let ($lhs, $rhs): ($Simd, $Simd) = (self.into(), (*other).into()); $body
             }
         }
-
         impl<'a> $Op<$Rhs> for &'a $Lhs {
             #[inline]
             fn $op(self, other: $Rhs) -> $Output {
@@ -439,7 +429,6 @@ macro_rules! impl_operator_simd {
             }
         }
-
         impl<'a, 'b> $Op<&'a $Rhs> for &'b $Lhs {
             #[inline]
             fn $op(self, other: &'a $Rhs) -> $Output {
@@ -447,18 +436,17 @@ macro_rules! impl_operator_simd {
             }
         }
     };
+
     // When the left operand is a scalar
     (@ls [$Simd:ident]; $Op:ident<$Rhs:ty> for $Lhs:ident {
         fn $op:ident($lhs:ident, $rhs:ident) -> $Output:ty { $body:expr }
     }) => {
-
         impl $Op<$Rhs> for $Lhs {
             #[inline]
             fn $op(self, other: $Rhs) -> $Output {
                 let ($lhs, $rhs): ($Simd, $Simd) = ($Simd::splat(self), other.into()); $body
             }
         }
-
         impl<'a> $Op<&'a $Rhs> for $Lhs {
             #[inline]
             fn $op(self, other: &'a $Rhs) -> $Output {
@@ -467,47 +455,4 @@ macro_rules! impl_operator_simd {
             }
         }
     };
-
-    // // When left is row-vec, right is colume-matrix
-    // (@vm [$Simd: ident]; $Op:ident<$Rhs:ty> for $Lhs:ty {
-    //     fn $op:ident($lhs:ident, $rhs:ident) -> $Output:ty {
-
-    //     }
-    // })
-
-    // When matrix with matrix
-    (@mm $Op:ident<$Rhs:ty> for $Lhs:ty {
-        fn $op:ident($lhs:ident, $rhs:ident) -> $Output:ty { $body: expr }
-    }) => {
-        impl $Op<$Rhs> for $Lhs {
-            #[inline]
-            fn $op(self, other: $Rhs) -> $Output {
-                let ($lhs, $rhs) = (self, other); $body
-            }
-        }
-
-
-        impl<'a> $Op<&'a $Rhs> for $Lhs {
-            #[inline]
-            fn $op(self, other: &'a $Rhs) -> $Output {
-                let ($lhs, $rhs) = (self, other); $body
-            }
-        }
-
-
-        impl<'a> $Op<$Rhs> for &'a $Lhs {
-            #[inline]
-            fn $op(self, other: $Rhs) -> $Output {
-                let ($lhs, $rhs) = (self, other); $body
-            }
-        }
-
-
-        impl<'a, 'b> $Op<&'a $Rhs> for &'b $Lhs {
-            #[inline]
-            fn $op(self, other: &'a $Rhs) -> $Output {
-                let ($lhs, $rhs) = (self, other); $body
-            }
-        }
-    }
 }
diff --git a/src/matrix.rs b/src/matrix.rs
index dc9f856..397d2ba 100644
--- a/src/matrix.rs
+++ b/src/matrix.rs
@@ -645,23 +645,10 @@ impl<S: BaseFloat> SquareMatrix for Matrix4<S> {
     }
 
     fn determinant(&self) -> S {
-        let m0 = Matrix3::new(self[1][1], self[2][1], self[3][1],
-                              self[1][2], self[2][2], self[3][2],
-                              self[1][3], self[2][3], self[3][3]);
-        let m1 = Matrix3::new(self[0][1], self[2][1], self[3][1],
-                              self[0][2], self[2][2], self[3][2],
-                              self[0][3], self[2][3], self[3][3]);
-        let m2 = Matrix3::new(self[0][1], self[1][1], self[3][1],
-                              self[0][2], self[1][2], self[3][2],
-                              self[0][3], self[1][3], self[3][3]);
-        let m3 = Matrix3::new(self[0][1], self[1][1], self[2][1],
-                              self[0][2], self[1][2], self[2][2],
-                              self[0][3], self[1][3], self[2][3]);
-
-        self[0][0] * m0.determinant() -
-            self[1][0] * m1.determinant() +
-            self[2][0] * m2.determinant() -
-            self[3][0] * m3.determinant()
+        let tmp = unsafe {
+            det_sub_proc_unsafe(self, 1, 2, 3)
+        };
+        tmp.dot(Vector4::new(self[0][0], self[1][0], self[2][0], self[3][0]))
     }
 
     #[inline]
@@ -672,6 +659,12 @@ impl<S: BaseFloat> SquareMatrix for Matrix4<S> {
                      self[3][3])
     }
 
+    // The new implementation results in negative optimization when used
+    // without SIMD. so we opt them in with configuration.
+    // A better option would be using specialization. But currently somewhat
+    // specialization is too buggy, and it won't apply here. I'm getting
+    // weird error msgs. Help wanted.
+    #[cfg(not(feature = "use_simd"))]
     fn invert(&self) -> Option<Matrix4<S>> {
         let det = self.determinant();
         if ulps_eq!(det, &S::zero()) { None } else {
@@ -695,6 +688,27 @@ impl<S: BaseFloat> SquareMatrix for Matrix4<S> {
                               cf(3, 0), cf(3, 1), cf(3, 2), cf(3, 3)))
         }
     }
+    #[cfg(feature = "use_simd")]
+    fn invert(&self) -> Option<Matrix4<S>> {
+        let tmp0 = unsafe {
+            det_sub_proc_unsafe(self, 1, 2, 3)
+        };
+        let det = tmp0.dot(Vector4::new(self[0][0], self[1][0], self[2][0], self[3][0]));
+        if ulps_eq!(det, &S::zero()) { None } else {
+            let inv_det = S::one() / det;
+            let tmp0 = tmp0 * inv_det;
+            let tmp1 = unsafe {
+                det_sub_proc_unsafe(self, 0, 3, 2) * inv_det
+            };
+            let tmp2 = unsafe {
+                det_sub_proc_unsafe(self, 0, 1, 3) * inv_det
+            };
+            let tmp3 = unsafe {
+                det_sub_proc_unsafe(self, 0, 2, 1) * inv_det
+            };
+            Some(Matrix4::from_cols(tmp0, tmp1, tmp2, tmp3))
+        }
+    }
 
     fn is_diagonal(&self) -> bool {
         ulps_eq!(self[0][1], &S::zero()) &&
@@ -1036,6 +1050,7 @@ impl_operator!(<S: BaseFloat> Mul<Matrix3<S> > for Matrix3<S> {
 // causes the LLVM to miss identical loads and multiplies. This optimization
 // causes the code to be auto vectorized properly increasing the performance
 // around ~4 times.
+// Update: this should now be a bit more efficient
 impl_operator!(<S: BaseFloat> Mul<Matrix4<S> > for Matrix4<S> {
     fn mul(lhs, rhs) -> Matrix4<S> {
         {
@@ -1335,39 +1350,26 @@ impl<S: BaseFloat + Rand> Rand for Matrix4<S> {
     }
 }
 
-// Specialization strangely won't work for this.
-// TODO: find another way
-// #[cfg(feature = "use_simd")]
-// impl SquareMatrix for Matrix4<f32> {
-//     fn determinant(&self) -> f32 {
-//         let a = Simdf32x4::new(self.z[1], self.x[1], self.w[1], self.y[1]);
-//         let b = Simdf32x4::new(self.y[2], self.y[2], self.z[2], self.z[2]);
-//         let c = Simdf32x4::new(self.x[3], self.z[3], self.x[3], self.z[3]);
-//         let mut tmp = a * (b * c);
-//         let d = Simdf32x4::new(self.y[1], self.y[1], self.z[1], self.z[1]);
-//         let e = Simdf32x4::new(self.x[2], self.z[2], self.x[2], self.z[2]);
-//         let f = Simdf32x4::new(self.z[3], self.x[3], self.w[3], self.y[3]);
-//         let tmp1 = d * (e * f);
-//         tmp = tmp + tmp1;
-//         let g = Simdf32x4::new(self.x[1], self.z[1], self.x[1], self.z[1]);
-//         let h = Simdf32x4::new(self.z[2], self.x[2], self.w[2], self.y[2]);
-//         let i = Simdf32x4::new(self.y[3], self.y[3], self.z[3], self.z[3]);
-//         let tmp1 = g * (h * i);
-//         tmp = tmp + tmp1;
-//         let tmp1 = g * (b * f);
-//         tmp = tmp - tmp1;
-//         let tmp1 = d * (h * c);
-//         tmp = tmp - tmp1;
-//         let tmp1 = a * (e * i);
-//         tmp = tmp - tmp1;
-//         let tmp: Vector4<f32> = (tmp * Simdf32x4::new(self.x[0], self.y[0], self.z[0], self.w[0])).into();
-//         tmp.sum()
-//     }
-// }
+// Sub procedure for SIMD when dealing with determinant and inversion
+#[inline]
+unsafe fn det_sub_proc_unsafe<S: BaseFloat>(m: &Matrix4<S>, x: usize, y: usize, z: usize) -> Vector4<S> {
+    let s: &[S; 16] = m.as_ref();
+    let a = Vector4::new(*s.get_unchecked(4 + x), *s.get_unchecked(12 + x), *s.get_unchecked(x), *s.get_unchecked(8 + x));
+    let b = Vector4::new(*s.get_unchecked(8 + y), *s.get_unchecked(8 + y), *s.get_unchecked(4 + y), *s.get_unchecked(4 + y));
+    let c = Vector4::new(*s.get_unchecked(12 + z), *s.get_unchecked(z), *s.get_unchecked(12 + z), *s.get_unchecked(z));
 
-// #[cfg(feature = "use_simd")]
-// impl_operator_simd!(@mm Mul<Vector4<f32>> for Matrix4<f32> {
-//     fn mul(matrix, vector) -> Vector4<f32> {
-//         matrix[0] * vector[0] + matrix[1] * vector[1] + matrix[2] * vector[2] + matrix[3] * vector[3]
-//     }
-// });
\ No newline at end of file
+    let d = Vector4::new(*s.get_unchecked(8 + x), *s.get_unchecked(8 + x), *s.get_unchecked(4 + x), *s.get_unchecked(4 + x));
+    let e = Vector4::new(*s.get_unchecked(12 + y), *s.get_unchecked(y), *s.get_unchecked(12 + y), *s.get_unchecked(y));
+    let f = Vector4::new(*s.get_unchecked(4 + z), *s.get_unchecked(12 + z), *s.get_unchecked(z), *s.get_unchecked(8 + z));
+
+    let g = Vector4::new(*s.get_unchecked(12 + x), *s.get_unchecked(x), *s.get_unchecked(12 + x), *s.get_unchecked(x));
+    let h = Vector4::new(*s.get_unchecked(4 + y), *s.get_unchecked(12 + y), *s.get_unchecked(y), *s.get_unchecked(8 + y));
+    let i = Vector4::new(*s.get_unchecked(8 + z), *s.get_unchecked(8 + z), *s.get_unchecked(4 + z), *s.get_unchecked(4 + z));
+    let mut tmp = a.mul_element_wise(b.mul_element_wise(c));
+    tmp += d.mul_element_wise(e.mul_element_wise(f));
+    tmp += g.mul_element_wise(h.mul_element_wise(i));
+    tmp -= a.mul_element_wise(e.mul_element_wise(i));
+    tmp -= d.mul_element_wise(h.mul_element_wise(c));
+    tmp -= g.mul_element_wise(b.mul_element_wise(f));
+    tmp
+}
\ No newline at end of file