From e4689ce22badbf64f5621233882423c78d739b80 Mon Sep 17 00:00:00 2001
From: Colin Sherratt <colin.sherratt@gmail.com>
Date: Tue, 26 Nov 2013 03:56:18 -0500
Subject: [PATCH 1/2] Improved the performance of mat4 multiply.

---
 src/cgmath/matrix.rs | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)
diff --git a/src/cgmath/matrix.rs b/src/cgmath/matrix.rs
index 7d3642a..4e93534 100644
--- a/src/cgmath/matrix.rs
+++ b/src/cgmath/matrix.rs
@@ -458,15 +458,23 @@ for Mat3<S>
     }
 }
 
+macro_rules! dot_mat4(
+    ($A:expr, $B:expr, $I:expr, $J:expr) => (
+        *$A.cr(0, $I) * *$B.cr($J, 0) +
+        *$A.cr(1, $I) * *$B.cr($J, 1) +
+        *$A.cr(2, $I) * *$B.cr($J, 2) +
+        *$A.cr(3, $I) * *$B.cr($J, 3)
+))
+
 impl<S: Float>
 Matrix<S, [Vec4<S>, ..4], Vec4<S>, [S, ..4]>
 for Mat4<S>
 {
     fn mul_m(&self, other: &Mat4<S>) -> Mat4<S> {
-        Mat4::new(self.r(0).dot(other.c(0)), self.r(1).dot(other.c(0)), self.r(2).dot(other.c(0)), self.r(3).dot(other.c(0)),
-                  self.r(0).dot(other.c(1)), self.r(1).dot(other.c(1)), self.r(2).dot(other.c(1)), self.r(3).dot(other.c(1)),
-                  self.r(0).dot(other.c(2)), self.r(1).dot(other.c(2)), self.r(2).dot(other.c(2)), self.r(3).dot(other.c(2)),
-                  self.r(0).dot(other.c(3)), self.r(1).dot(other.c(3)), self.r(2).dot(other.c(3)), self.r(3).dot(other.c(3)))
+        Mat4::new(dot_mat4!(self, other, 0, 0), dot_mat4!(self, other, 1, 0), dot_mat4!(self, other, 2, 0), dot_mat4!(self, other, 3, 0),
+                  dot_mat4!(self, other, 0, 1), dot_mat4!(self, other, 1, 1), dot_mat4!(self, other, 2, 1), dot_mat4!(self, other, 3, 1),
+                  dot_mat4!(self, other, 0, 2), dot_mat4!(self, other, 1, 2), dot_mat4!(self, other, 2, 2), dot_mat4!(self, other, 3, 2),
+                  dot_mat4!(self, other, 0, 3), dot_mat4!(self, other, 1, 3), dot_mat4!(self, other, 2, 3), dot_mat4!(self, other, 3, 3))
     }
 
     fn transpose(&self) -> Mat4<S> {

From 149c781e81993889f59476411a6e8177cc12b5be Mon Sep 17 00:00:00 2001
From: Colin Sherratt <colin.sherratt@gmail.com>
Date: Wed, 27 Nov 2013 12:20:13 -0500
Subject: [PATCH 2/2] Documented the mat4 multiply optimization.

---
 src/cgmath/matrix.rs | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/cgmath/matrix.rs b/src/cgmath/matrix.rs
index 4e93534..23437b0 100644
--- a/src/cgmath/matrix.rs
+++ b/src/cgmath/matrix.rs
@@ -458,12 +458,16 @@ for Mat3<S>
     }
 }
 
+// Using self.r(0).dot(other.c(0)) like the other matrix multiplies
+// causes LLVM to miss identical loads and multiplies. This optimization
+// lets the code be auto-vectorized properly, increasing performance
+// roughly 4 times.
 macro_rules! dot_mat4(
     ($A:expr, $B:expr, $I:expr, $J:expr) => (
-        *$A.cr(0, $I) * *$B.cr($J, 0) +
-        *$A.cr(1, $I) * *$B.cr($J, 1) +
-        *$A.cr(2, $I) * *$B.cr($J, 2) +
-        *$A.cr(3, $I) * *$B.cr($J, 3)
+        (*$A.cr(0, $I)) * (*$B.cr($J, 0)) +
+        (*$A.cr(1, $I)) * (*$B.cr($J, 1)) +
+        (*$A.cr(2, $I)) * (*$B.cr($J, 2)) +
+        (*$A.cr(3, $I)) * (*$B.cr($J, 3))
 ))
 
 impl<S: Float>