nalgebra/src/core/blas.rs

use std::mem;
use num::{Zero, One, Signed};
use matrixmultiply;
use alga::general::{ClosedMul, ClosedAdd};

use core::{Scalar, Matrix, Vector};
use core::dimension::{Dim, U1, U2, U3, U4, Dynamic};
use core::constraint::{ShapeConstraint, SameNumberOfRows, SameNumberOfColumns, AreMultipliable, DimEq};
use core::storage::{Storage, StorageMut};


impl<N: Scalar + PartialOrd + Signed, D: Dim, S: Storage<N, D>> Vector<N, D, S> {
    /// Computes the index of the vector component with the largest absolute value.
    #[inline]
    pub fn iamax(&self) -> usize {
        assert!(!self.is_empty(), "The input vector must not be empty.");

        let mut the_max = unsafe { self.vget_unchecked(0).abs() };
        let mut the_i   = 0;

        for i in 1 .. self.nrows() {
            let val = unsafe { self.vget_unchecked(i).abs() };

            if val > the_max {
                the_max = val;
                the_i   = i;
            }
        }

        the_i
    }
}

impl<N: Scalar + PartialOrd + Signed, R: Dim, C: Dim, S: Storage<N, R, C>> Matrix<N, R, C, S> {
    /// Computes the index of the matrix component with the largest absolute value.
    #[inline]
    pub fn iamax_full(&self) -> (usize, usize) {
        assert!(!self.is_empty(), "The input matrix must not be empty.");

        let mut the_max = unsafe { self.get_unchecked(0, 0).abs() };
        let mut the_ij  = (0, 0);

        for j in 0 .. self.ncols() {
            for i in 0 .. self.nrows() {
                let val = unsafe { self.get_unchecked(i, j).abs() };

                if val > the_max {
                    the_max = val;
                    the_ij  = (i, j);
                }
            }
        }

        the_ij
    }
}

impl<N, R: Dim, C: Dim, S: Storage<N, R, C>> Matrix<N, R, C, S>
    where N: Scalar + Zero + ClosedAdd + ClosedMul {
    /// The dot product between two matrices (seen as vectors).
    ///
    /// Note that this is **not** the matrix multiplication as in, e.g., numpy. For matrix
    /// multiplication, use one of: `.gemm`, `mul_to`, `.mul`, `*`.
    #[inline]
    pub fn dot<R2: Dim, C2: Dim, SB>(&self, rhs: &Matrix<N, R2, C2, SB>) -> N
        where SB: Storage<N, R2, C2>,
              ShapeConstraint: DimEq<R, R2> + DimEq<C, C2> {
        assert!(self.nrows() == rhs.nrows(), "Dot product dimensions mismatch.");


        // So we do some special cases for common fixed-size vectors of dimension lower than 8
        // because the `for` loop bellow won't be very efficient on those.
        if (R::is::<U2>() || R2::is::<U2>()) &&
           (C::is::<U1>() || C2::is::<U1>()) {
            unsafe {
                let a = *self.get_unchecked(0, 0) * *rhs.get_unchecked(0, 0);
                let b = *self.get_unchecked(1, 0) * *rhs.get_unchecked(1, 0);

                return a + b;
            }
        }
        if (R::is::<U3>() || R2::is::<U3>()) &&
           (C::is::<U1>() || C2::is::<U1>()) {
            unsafe {
                let a = *self.get_unchecked(0, 0) * *rhs.get_unchecked(0, 0);
                let b = *self.get_unchecked(1, 0) * *rhs.get_unchecked(1, 0);
                let c = *self.get_unchecked(2, 0) * *rhs.get_unchecked(2, 0);

                return a + b + c;
            }
        }
        if (R::is::<U4>() || R2::is::<U4>()) &&
           (C::is::<U1>() || C2::is::<U1>()) {
            unsafe {
                let mut a = *self.get_unchecked(0, 0) * *rhs.get_unchecked(0, 0);
                let mut b = *self.get_unchecked(1, 0) * *rhs.get_unchecked(1, 0);
                let c = *self.get_unchecked(2, 0) * *rhs.get_unchecked(2, 0);
                let d = *self.get_unchecked(3, 0) * *rhs.get_unchecked(3, 0);

                a += c;
                b += d;

                return a + b;
            }
        }


        // All this is inspired from the "unrolled version" discussed in:
        // http://blog.theincredibleholk.org/blog/2012/12/10/optimizing-dot-product/
        //
        // And this comment from bluss:
        // https://users.rust-lang.org/t/how-to-zip-two-slices-efficiently/2048/12
        let mut res = N::zero();

        // We have to define them outside of the loop (and not inside at first assignment)
        // otherwize vectorization won't kick in for some reason.
        let mut acc0;
        let mut acc1;
        let mut acc2;
        let mut acc3;
        let mut acc4;
        let mut acc5;
        let mut acc6;
        let mut acc7;

        for j in 0 .. self.ncols() {
            let mut i = 0;

            acc0 = N::zero();
            acc1 = N::zero();
            acc2 = N::zero();
            acc3 = N::zero();
            acc4 = N::zero();
            acc5 = N::zero();
            acc6 = N::zero();
            acc7 = N::zero();

            while self.nrows() - i >= 8 {
                acc0 += unsafe { *self.get_unchecked(i + 0, j) * *rhs.get_unchecked(i + 0, j) };
                acc1 += unsafe { *self.get_unchecked(i + 1, j) * *rhs.get_unchecked(i + 1, j) };
                acc2 += unsafe { *self.get_unchecked(i + 2, j) * *rhs.get_unchecked(i + 2, j) };
                acc3 += unsafe { *self.get_unchecked(i + 3, j) * *rhs.get_unchecked(i + 3, j) };
                acc4 += unsafe { *self.get_unchecked(i + 4, j) * *rhs.get_unchecked(i + 4, j) };
                acc5 += unsafe { *self.get_unchecked(i + 5, j) * *rhs.get_unchecked(i + 5, j) };
                acc6 += unsafe { *self.get_unchecked(i + 6, j) * *rhs.get_unchecked(i + 6, j) };
                acc7 += unsafe { *self.get_unchecked(i + 7, j) * *rhs.get_unchecked(i + 7, j) };
                i += 8;
            }

            res += acc0 + acc4;
            res += acc1 + acc5;
            res += acc2 + acc6;
            res += acc3 + acc7;

            for k in i .. self.nrows() {
                res += unsafe { *self.get_unchecked(k, j) * *rhs.get_unchecked(k, j) }
            }
        }

        res
    }

    /// The dot product between the transpose of `self` and `rhs`.
    #[inline]
    pub fn tr_dot<R2: Dim, C2: Dim, SB>(&self, rhs: &Matrix<N, R2, C2, SB>) -> N
        where SB: Storage<N, R2, C2>,
              ShapeConstraint: DimEq<C, R2> + DimEq<R, C2> {
        let (nrows, ncols) = self.shape();
        assert!((ncols, nrows) == rhs.shape(), "Transposed dot product dimension mismatch.");

        let mut res = N::zero();

        for j in 0 .. self.nrows() {
            for i in 0 .. self.ncols() {
                res += unsafe { *self.get_unchecked(j, i) * *rhs.get_unchecked(i, j) }
            }
        }

        res
    }
}

fn array_axpy<N>(y: &mut [N], a: N, x: &[N], beta: N, stride1: usize, stride2: usize, len: usize)
    where N: Scalar + Zero + ClosedAdd + ClosedMul {
    for i in 0 .. len {
        unsafe {
            let y = y.get_unchecked_mut(i * stride1);
            *y = a * *x.get_unchecked(i * stride2) + beta * *y;
        }
    }
}

fn array_ax<N>(y: &mut [N], a: N, x: &[N], stride1: usize, stride2: usize, len: usize)
    where N: Scalar + Zero + ClosedAdd + ClosedMul {
    for i in 0 .. len {
        unsafe {
            *y.get_unchecked_mut(i * stride1) = a * *x.get_unchecked(i * stride2);
        }
    }
}

impl<N, D: Dim, S> Vector<N, D, S>
    where N: Scalar + Zero + ClosedAdd + ClosedMul,
          S: StorageMut<N, D> {
    /// Computes `self = a * x + b * self`.
    ///
    /// If be is zero, `self` is never read from.
    #[inline]
    pub fn axpy<D2: Dim, SB>(&mut self, a: N, x: &Vector<N, D2, SB>, b: N)
        where SB: Storage<N, D2>,
              ShapeConstraint: DimEq<D, D2> {

        assert_eq!(self.nrows(), x.nrows(), "Axpy: mismatched vector shapes.");

        let rstride1 = self.strides().0;
        let rstride2 = x.strides().0;

        let y = self.data.as_mut_slice();
        let x = x.data.as_slice();

        if !b.is_zero() {
            array_axpy(y, a, x, b, rstride1, rstride2, x.len());
        }
        else {
            array_ax(y, a, x, rstride1, rstride2, x.len());
        }
    }

    /// Computes `self = alpha * a * x + beta * self`, where `a` is a matrix, `x` a vector, and
    /// `alpha, beta` two scalars.
    ///
    /// If `beta` is zero, `self` is never read.
    #[inline]
    pub fn gemv<R2: Dim, C2: Dim, D3: Dim, SB, SC>(&mut self,
                                                   alpha: N,
                                                   a:     &Matrix<N, R2, C2, SB>,
                                                   x:     &Vector<N, D3, SC>,
                                                   beta:  N)
        where N:  One,
              SB: Storage<N, R2, C2>,
              SC: Storage<N, D3>,
              ShapeConstraint: DimEq<D, R2> +
                               AreMultipliable<R2, C2, D3, U1> {
        let dim1 = self.nrows();
        let (nrows2, ncols2) = a.shape();
        let dim3 = x.nrows();

        assert!(ncols2 == dim3 && dim1 == nrows2, "Gemv: dimensions mismatch.");

        if ncols2 == 0 {
            return;
        }

        // FIXME: avoid bound checks.
        let col2 = a.column(0);
        let val  = unsafe { *x.vget_unchecked(0) };
        self.axpy(alpha * val, &col2, beta);

        for j in 1 .. ncols2 {
            let col2 = a.column(j);
            let val  = unsafe { *x.vget_unchecked(j) };

            self.axpy(alpha * val, &col2, N::one());
        }
    }

    /// Computes `self = alpha * a * x + beta * self`, where `a` is a **symmetric** matrix, `x` a
    /// vector, and `alpha, beta` two scalars.
    ///
    /// If `beta` is zero, `self` is never read. If `self` is read, only its lower-triangular part
    /// (including the diagonal) is actually read.
    #[inline]
    pub fn gemv_symm<D2: Dim, D3: Dim, SB, SC>(&mut self,
                                               alpha: N,
                                               a:     &Matrix<N, D2, D2, SB>,
                                               x:     &Vector<N, D3, SC>,
                                               beta:  N)
        where N:  One,
              SB: Storage<N, D2, D2>,
              SC: Storage<N, D3>,
              ShapeConstraint: DimEq<D, D2> +
                               AreMultipliable<D2, D2, D3, U1> {
        let dim1 = self.nrows();
        let dim2 = a.nrows();
        let dim3 = x.nrows();

        assert!(a.is_square(), "Syetric gemv: the input matrix must be square.");
        assert!(dim2 == dim3 && dim1 == dim2, "Symmetric gemv: dimensions mismatch.");

        if dim2 == 0 {
            return;
        }

        // FIXME: avoid bound checks.
        let col2 = a.column(0);
        let val  = unsafe { *x.vget_unchecked(0) };
        self.axpy(alpha * val, &col2, beta);
        self[0] += alpha * x.rows_range(1 ..).dot(&a.slice_range(1 .., 0));

        for j in 1 .. dim2 {
            let col2 = a.column(j);
            let dot  = x.rows_range(j ..).dot(&col2.rows_range(j ..));

            let val;
            unsafe {
                val = *x.vget_unchecked(j);
                *self.vget_unchecked_mut(j) += alpha * dot;
            }
            self.rows_range_mut(j + 1 ..).axpy(alpha * val, &col2.rows_range(j + 1 ..), N::one());
        }
    }
}

impl<N, R1: Dim, C1: Dim, S: StorageMut<N, R1, C1>> Matrix<N, R1, C1, S>
    where N: Scalar + Zero + ClosedAdd + ClosedMul {

    /// Computes `self = alpha * x * y.transpose() + beta * self`.
    ///
    /// If `beta` is zero, `self` is never read.
    #[inline]
    pub fn ger<D2: Dim, D3: Dim, SB, SC>(&mut self, alpha: N, x: &Vector<N, D2, SB>, y: &Vector<N, D3, SC>, beta: N)
        where N: One,
              SB: Storage<N, D2>,
              SC: Storage<N, D3>,
              ShapeConstraint: DimEq<R1, D2> + DimEq<C1, D3> {
        let (nrows1, ncols1) = self.shape();
        let dim2 = x.nrows();
        let dim3 = y.nrows();

        assert!(nrows1 == dim2 && ncols1 == dim3, "ger: dimensions mismatch.");

        for j in 0 .. ncols1 {
            // FIXME: avoid bound checks.
            let val = unsafe { *y.vget_unchecked(j) };
            self.column_mut(j).axpy(alpha * val, x, beta);
        }
    }

    /// Computes `self = alpha * a * b + beta * self`, where `a, b, self` are matrices.
    /// `alpha` and `beta` are scalar.
    ///
    /// If `beta` is zero, `self` is never read.
    #[inline]
    pub fn gemm<R2: Dim, C2: Dim, R3: Dim, C3: Dim, SB, SC>(&mut self,
                                                            alpha: N,
                                                            a: &Matrix<N, R2, C2, SB>,
                                                            b: &Matrix<N, R3, C3, SC>,
                                                            beta: N)
        where N:  One,
              SB: Storage<N, R2, C2>,
              SC: Storage<N, R3, C3>,
              ShapeConstraint: SameNumberOfRows<R1, R2>    +
                               SameNumberOfColumns<C1, C3> +
                               AreMultipliable<R2, C2, R3, C3> {
       let (nrows1, ncols1) = self.shape();
       let (nrows2, ncols2) = a.shape();
       let (nrows3, ncols3) = b.shape();

       assert_eq!(ncols2, nrows3, "gemm: dimensions mismatch for multiplication.");
       assert_eq!((nrows1, ncols1), (nrows2, ncols3), "gemm: dimensions mismatch for addition.");

       // We assume large matrices will be Dynamic but small matrices static.
       // We could use matrixmultiply for large statically-sized matrices but the performance
       // threshold to activate it would be different from SMALL_DIM because our code optimizes
       // better for statically-sized matrices.
       let is_dynamic = R1::is::<Dynamic>() || C1::is::<Dynamic>() ||
                        R2::is::<Dynamic>() || C2::is::<Dynamic>() ||
                        R3::is::<Dynamic>() || C3::is::<Dynamic>();
       // Thershold determined ampirically.
       const SMALL_DIM: usize = 5;

       if is_dynamic &&
          nrows1 > SMALL_DIM && ncols1 > SMALL_DIM &&
          nrows2 > SMALL_DIM && ncols2 > SMALL_DIM {
           if N::is::<f32>() {
               let (rsa, csa) = a.strides();
               let (rsb, csb) = b.strides();
               let (rsc, csc) = self.strides();

               unsafe {
                   matrixmultiply::sgemm(
                       nrows2,
                       ncols2,
                       ncols3,
                       mem::transmute_copy(&alpha),
                       a.data.ptr() as *const f32,
                       rsa as isize, csa as isize,
                       b.data.ptr() as *const f32,
                       rsb as isize, csb as isize,
                       mem::transmute_copy(&beta),
                       self.data.ptr_mut() as *mut f32,
                       rsc as isize, csc as isize);
               }
           }
           else if N::is::<f64>() {
               let (rsa, csa) = a.strides();
               let (rsb, csb) = b.strides();
               let (rsc, csc) = self.strides();

               unsafe {
                   matrixmultiply::dgemm(
                       nrows2,
                       ncols2,
                       ncols3,
                       mem::transmute_copy(&alpha),
                       a.data.ptr() as *const f64,
                       rsa as isize, csa as isize,
                       b.data.ptr() as *const f64,
                       rsb as isize, csb as isize,
                       mem::transmute_copy(&beta),
                       self.data.ptr_mut() as *mut f64,
                       rsc as isize, csc as isize);
               }
           }
       }
       else {
           for j1 in 0 .. ncols1 {
               // FIXME: avoid bound checks.
               self.column_mut(j1).gemv(alpha, a, &b.column(j1), beta);
           }
       }
    }
}


impl<N, R1: Dim, C1: Dim, S: StorageMut<N, R1, C1>> Matrix<N, R1, C1, S>
    where N: Scalar + Zero + ClosedAdd + ClosedMul {
    /// Computes `self = alpha * x * y.transpose() + beta * self`, where `self` is a **symmetric**
    /// matrix.
    ///
    /// If `beta` is zero, `self` is never read. The result is symmetric. Only the lower-triangular
    /// (including the diagonal) part of `self` is read/written.
    #[inline]
    pub fn ger_symm<D2: Dim, D3: Dim, SB, SC>(&mut self,
                                              alpha: N,
                                              x:     &Vector<N, D2, SB>,
                                              y:     &Vector<N, D3, SC>,
                                              beta:  N)
        where N: One,
              SB: Storage<N, D2>,
              SC: Storage<N, D3>,
              ShapeConstraint: DimEq<R1, D2> + DimEq<C1, D3> {
        let dim1 = self.nrows();
        let dim2 = x.nrows();
        let dim3 = y.nrows();

        assert!(self.is_square(), "Symmetric ger: the input matrix must be square.");
        assert!(dim1 == dim2 && dim1 == dim3, "ger: dimensions mismatch.");

        for j in 0 .. dim1 {
            // FIXME: avoid bound checks.
            let val = unsafe { *y.vget_unchecked(j) };
            let subdim = Dynamic::new(dim1 - j);
            self.generic_slice_mut((j, j), (subdim, U1)).axpy(alpha * val, &x.rows_range(j ..), beta);
        }
    }
}
Add the most common matrix decompositions. 2017-08-03 01:37:44 +08:00			`use std::mem;`
			`use num::{Zero, One, Signed};`
			`use matrixmultiply;`
			`use alga::general::{ClosedMul, ClosedAdd};`

			`use core::{Scalar, Matrix, Vector};`
			`use core::dimension::{Dim, U1, U2, U3, U4, Dynamic};`
			`use core::constraint::{ShapeConstraint, SameNumberOfRows, SameNumberOfColumns, AreMultipliable, DimEq};`
			`use core::storage::{Storage, StorageMut};`



			`impl<N: Scalar + PartialOrd + Signed, D: Dim, S: Storage<N, D>> Vector<N, D, S> {`
			`/// Computes the index of the vector component with the largest absolute value.`
			`#[inline]`
			`pub fn iamax(&self) -> usize {`
			`assert!(!self.is_empty(), "The input vector must not be empty.");`

			`let mut the_max = unsafe { self.vget_unchecked(0).abs() };`
			`let mut the_i = 0;`

			`for i in 1 .. self.nrows() {`
			`let val = unsafe { self.vget_unchecked(i).abs() };`

			`if val > the_max {`
			`the_max = val;`
			`the_i = i;`
			`}`
			`}`

			`the_i`
			`}`
			`}`

			`impl<N: Scalar + PartialOrd + Signed, R: Dim, C: Dim, S: Storage<N, R, C>> Matrix<N, R, C, S> {`
			`/// Computes the index of the matrix component with the largest absolute value.`
			`#[inline]`
			`pub fn iamax_full(&self) -> (usize, usize) {`
			`assert!(!self.is_empty(), "The input matrix must not be empty.");`

			`let mut the_max = unsafe { self.get_unchecked(0, 0).abs() };`
			`let mut the_ij = (0, 0);`

			`for j in 0 .. self.ncols() {`
			`for i in 0 .. self.nrows() {`
			`let val = unsafe { self.get_unchecked(i, j).abs() };`

			`if val > the_max {`
			`the_max = val;`
			`the_ij = (i, j);`
			`}`
			`}`
			`}`

			`the_ij`
			`}`
			`}`

			`impl<N, R: Dim, C: Dim, S: Storage<N, R, C>> Matrix<N, R, C, S>`
			`where N: Scalar + Zero + ClosedAdd + ClosedMul {`
			`/// The dot product between two matrices (seen as vectors).`
			`///`
			`/// Note that this is not the matrix multiplication as in, e.g., numpy. For matrix`
			/// multiplication, use one of: `.gemm`, `mul_to`, `.mul`, `*`.
			`#[inline]`
			`pub fn dot<R2: Dim, C2: Dim, SB>(&self, rhs: &Matrix<N, R2, C2, SB>) -> N`
			`where SB: Storage<N, R2, C2>,`
			`ShapeConstraint: DimEq<R, R2> + DimEq<C, C2> {`
			`assert!(self.nrows() == rhs.nrows(), "Dot product dimensions mismatch.");`


			`// So we do some special cases for common fixed-size vectors of dimension lower than 8`
			// because the `for` loop bellow won't be very efficient on those.
			`if (R::is::<U2>() \|\| R2::is::<U2>()) &&`
			`(C::is::<U1>() \|\| C2::is::<U1>()) {`
			`unsafe {`
			`let a = self.get_unchecked(0, 0) *rhs.get_unchecked(0, 0);`
			`let b = self.get_unchecked(1, 0) *rhs.get_unchecked(1, 0);`

			`return a + b;`
			`}`
			`}`
			`if (R::is::<U3>() \|\| R2::is::<U3>()) &&`
			`(C::is::<U1>() \|\| C2::is::<U1>()) {`
			`unsafe {`
			`let a = self.get_unchecked(0, 0) *rhs.get_unchecked(0, 0);`
			`let b = self.get_unchecked(1, 0) *rhs.get_unchecked(1, 0);`
			`let c = self.get_unchecked(2, 0) *rhs.get_unchecked(2, 0);`

			`return a + b + c;`
			`}`
			`}`
			`if (R::is::<U4>() \|\| R2::is::<U4>()) &&`
			`(C::is::<U1>() \|\| C2::is::<U1>()) {`
			`unsafe {`
			`let mut a = self.get_unchecked(0, 0) *rhs.get_unchecked(0, 0);`
			`let mut b = self.get_unchecked(1, 0) *rhs.get_unchecked(1, 0);`
			`let c = self.get_unchecked(2, 0) *rhs.get_unchecked(2, 0);`
			`let d = self.get_unchecked(3, 0) *rhs.get_unchecked(3, 0);`

			`a += c;`
			`b += d;`

			`return a + b;`
			`}`
			`}`


			`// All this is inspired from the "unrolled version" discussed in:`
			`// http://blog.theincredibleholk.org/blog/2012/12/10/optimizing-dot-product/`
			`//`
			`// And this comment from bluss:`
			`// https://users.rust-lang.org/t/how-to-zip-two-slices-efficiently/2048/12`
			`let mut res = N::zero();`

			`// We have to define them outside of the loop (and not inside at first assignment)`
			`// otherwize vectorization won't kick in for some reason.`
			`let mut acc0;`
			`let mut acc1;`
			`let mut acc2;`
			`let mut acc3;`
			`let mut acc4;`
			`let mut acc5;`
			`let mut acc6;`
			`let mut acc7;`

			`for j in 0 .. self.ncols() {`
			`let mut i = 0;`

			`acc0 = N::zero();`
			`acc1 = N::zero();`
			`acc2 = N::zero();`
			`acc3 = N::zero();`
			`acc4 = N::zero();`
			`acc5 = N::zero();`
			`acc6 = N::zero();`
			`acc7 = N::zero();`

			`while self.nrows() - i >= 8 {`
			`acc0 += unsafe { self.get_unchecked(i + 0, j) *rhs.get_unchecked(i + 0, j) };`
			`acc1 += unsafe { self.get_unchecked(i + 1, j) *rhs.get_unchecked(i + 1, j) };`
			`acc2 += unsafe { self.get_unchecked(i + 2, j) *rhs.get_unchecked(i + 2, j) };`
			`acc3 += unsafe { self.get_unchecked(i + 3, j) *rhs.get_unchecked(i + 3, j) };`
			`acc4 += unsafe { self.get_unchecked(i + 4, j) *rhs.get_unchecked(i + 4, j) };`
			`acc5 += unsafe { self.get_unchecked(i + 5, j) *rhs.get_unchecked(i + 5, j) };`
			`acc6 += unsafe { self.get_unchecked(i + 6, j) *rhs.get_unchecked(i + 6, j) };`
			`acc7 += unsafe { self.get_unchecked(i + 7, j) *rhs.get_unchecked(i + 7, j) };`
			`i += 8;`
			`}`

			`res += acc0 + acc4;`
			`res += acc1 + acc5;`
			`res += acc2 + acc6;`
			`res += acc3 + acc7;`

			`for k in i .. self.nrows() {`
			`res += unsafe { self.get_unchecked(k, j) *rhs.get_unchecked(k, j) }`
			`}`
			`}`

			`res`
			`}`

			/// The dot product between the transpose of `self` and `rhs`.
			`#[inline]`
			`pub fn tr_dot<R2: Dim, C2: Dim, SB>(&self, rhs: &Matrix<N, R2, C2, SB>) -> N`
			`where SB: Storage<N, R2, C2>,`
			`ShapeConstraint: DimEq<C, R2> + DimEq<R, C2> {`
			`let (nrows, ncols) = self.shape();`
			`assert!((ncols, nrows) == rhs.shape(), "Transposed dot product dimension mismatch.");`

			`let mut res = N::zero();`

			`for j in 0 .. self.nrows() {`
			`for i in 0 .. self.ncols() {`
			`res += unsafe { self.get_unchecked(j, i) *rhs.get_unchecked(i, j) }`
			`}`
			`}`

			`res`
			`}`
			`}`

			`fn array_axpy<N>(y: &mut [N], a: N, x: &[N], beta: N, stride1: usize, stride2: usize, len: usize)`
			`where N: Scalar + Zero + ClosedAdd + ClosedMul {`
			`for i in 0 .. len {`
			`unsafe {`
			`let y = y.get_unchecked_mut(i * stride1);`
			`y = a x.get_unchecked(i stride2) + beta * *y;`
			`}`
			`}`
			`}`

			`fn array_ax<N>(y: &mut [N], a: N, x: &[N], stride1: usize, stride2: usize, len: usize)`
			`where N: Scalar + Zero + ClosedAdd + ClosedMul {`
			`for i in 0 .. len {`
			`unsafe {`
			`y.get_unchecked_mut(i stride1) = a * x.get_unchecked(i stride2);`
			`}`
			`}`
			`}`

			`impl<N, D: Dim, S> Vector<N, D, S>`
			`where N: Scalar + Zero + ClosedAdd + ClosedMul,`
			`S: StorageMut<N, D> {`
			/// Computes `self = a * x + b * self`.
			`///`
			/// If be is zero, `self` is never read from.
			`#[inline]`
			`pub fn axpy<D2: Dim, SB>(&mut self, a: N, x: &Vector<N, D2, SB>, b: N)`
			`where SB: Storage<N, D2>,`
			`ShapeConstraint: DimEq<D, D2> {`

			`assert_eq!(self.nrows(), x.nrows(), "Axpy: mismatched vector shapes.");`

			`let rstride1 = self.strides().0;`
			`let rstride2 = x.strides().0;`

			`let y = self.data.as_mut_slice();`
			`let x = x.data.as_slice();`

			`if !b.is_zero() {`
			`array_axpy(y, a, x, b, rstride1, rstride2, x.len());`
			`}`
			`else {`
			`array_ax(y, a, x, rstride1, rstride2, x.len());`
			`}`
			`}`

			/// Computes `self = alpha * a * x + beta * self`, where `a` is a matrix, `x` a vector, and
			/// `alpha, beta` two scalars.
			`///`
			/// If `beta` is zero, `self` is never read.
			`#[inline]`
			`pub fn gemv<R2: Dim, C2: Dim, D3: Dim, SB, SC>(&mut self,`
			`alpha: N,`
			`a: &Matrix<N, R2, C2, SB>,`
			`x: &Vector<N, D3, SC>,`
			`beta: N)`
			`where N: One,`
			`SB: Storage<N, R2, C2>,`
			`SC: Storage<N, D3>,`
			`ShapeConstraint: DimEq<D, R2> +`
			`AreMultipliable<R2, C2, D3, U1> {`
			`let dim1 = self.nrows();`
			`let (nrows2, ncols2) = a.shape();`
			`let dim3 = x.nrows();`

			`assert!(ncols2 == dim3 && dim1 == nrows2, "Gemv: dimensions mismatch.");`

			`if ncols2 == 0 {`
			`return;`
			`}`

			`// FIXME: avoid bound checks.`
			`let col2 = a.column(0);`
			`let val = unsafe { *x.vget_unchecked(0) };`
			`self.axpy(alpha * val, &col2, beta);`

			`for j in 1 .. ncols2 {`
			`let col2 = a.column(j);`
			`let val = unsafe { *x.vget_unchecked(j) };`

			`self.axpy(alpha * val, &col2, N::one());`
			`}`
			`}`

			/// Computes `self = alpha * a * x + beta * self`, where `a` is a symmetric matrix, `x` a
			/// vector, and `alpha, beta` two scalars.
			`///`
			/// If `beta` is zero, `self` is never read. If `self` is read, only its lower-triangular part
			`/// (including the diagonal) is actually read.`
			`#[inline]`
			`pub fn gemv_symm<D2: Dim, D3: Dim, SB, SC>(&mut self,`
			`alpha: N,`
			`a: &Matrix<N, D2, D2, SB>,`
			`x: &Vector<N, D3, SC>,`
			`beta: N)`
			`where N: One,`
			`SB: Storage<N, D2, D2>,`
			`SC: Storage<N, D3>,`
			`ShapeConstraint: DimEq<D, D2> +`
			`AreMultipliable<D2, D2, D3, U1> {`
			`let dim1 = self.nrows();`
			`let dim2 = a.nrows();`
			`let dim3 = x.nrows();`

			`assert!(a.is_square(), "Syetric gemv: the input matrix must be square.");`
			`assert!(dim2 == dim3 && dim1 == dim2, "Symmetric gemv: dimensions mismatch.");`

			`if dim2 == 0 {`
			`return;`
			`}`

			`// FIXME: avoid bound checks.`
			`let col2 = a.column(0);`
			`let val = unsafe { *x.vget_unchecked(0) };`
			`self.axpy(alpha * val, &col2, beta);`
			`self[0] += alpha * x.rows_range(1 ..).dot(&a.slice_range(1 .., 0));`

			`for j in 1 .. dim2 {`
			`let col2 = a.column(j);`
			`let dot = x.rows_range(j ..).dot(&col2.rows_range(j ..));`

			`let val;`
			`unsafe {`
			`val = *x.vget_unchecked(j);`
			`self.vget_unchecked_mut(j) += alpha dot;`
			`}`
			`self.rows_range_mut(j + 1 ..).axpy(alpha * val, &col2.rows_range(j + 1 ..), N::one());`
			`}`
			`}`
			`}`

			`impl<N, R1: Dim, C1: Dim, S: StorageMut<N, R1, C1>> Matrix<N, R1, C1, S>`
			`where N: Scalar + Zero + ClosedAdd + ClosedMul {`

			/// Computes `self = alpha * x * y.transpose() + beta * self`.
			`///`
			/// If `beta` is zero, `self` is never read.
			`#[inline]`
			`pub fn ger<D2: Dim, D3: Dim, SB, SC>(&mut self, alpha: N, x: &Vector<N, D2, SB>, y: &Vector<N, D3, SC>, beta: N)`
			`where N: One,`
			`SB: Storage<N, D2>,`
			`SC: Storage<N, D3>,`
			`ShapeConstraint: DimEq<R1, D2> + DimEq<C1, D3> {`
			`let (nrows1, ncols1) = self.shape();`
			`let dim2 = x.nrows();`
			`let dim3 = y.nrows();`

			`assert!(nrows1 == dim2 && ncols1 == dim3, "ger: dimensions mismatch.");`

			`for j in 0 .. ncols1 {`
			`// FIXME: avoid bound checks.`
			`let val = unsafe { *y.vget_unchecked(j) };`
			`self.column_mut(j).axpy(alpha * val, x, beta);`
			`}`
			`}`

			/// Computes `self = alpha * a * b + beta * self`, where `a, b, self` are matrices.
			/// `alpha` and `beta` are scalar.
			`///`
			/// If `beta` is zero, `self` is never read.
			`#[inline]`
			`pub fn gemm<R2: Dim, C2: Dim, R3: Dim, C3: Dim, SB, SC>(&mut self,`
			`alpha: N,`
			`a: &Matrix<N, R2, C2, SB>,`
			`b: &Matrix<N, R3, C3, SC>,`
			`beta: N)`
			`where N: One,`
			`SB: Storage<N, R2, C2>,`
			`SC: Storage<N, R3, C3>,`
			`ShapeConstraint: SameNumberOfRows<R1, R2> +`
			`SameNumberOfColumns<C1, C3> +`
			`AreMultipliable<R2, C2, R3, C3> {`
			`let (nrows1, ncols1) = self.shape();`
			`let (nrows2, ncols2) = a.shape();`
			`let (nrows3, ncols3) = b.shape();`

			`assert_eq!(ncols2, nrows3, "gemm: dimensions mismatch for multiplication.");`
			`assert_eq!((nrows1, ncols1), (nrows2, ncols3), "gemm: dimensions mismatch for addition.");`

			`// We assume large matrices will be Dynamic but small matrices static.`
			`// We could use matrixmultiply for large statically-sized matrices but the performance`
			`// threshold to activate it would be different from SMALL_DIM because our code optimizes`
			`// better for statically-sized matrices.`
			`let is_dynamic = R1::is::<Dynamic>() \|\| C1::is::<Dynamic>() \|\|`
			`R2::is::<Dynamic>() \|\| C2::is::<Dynamic>() \|\|`
			`R3::is::<Dynamic>() \|\| C3::is::<Dynamic>();`
			`// Thershold determined ampirically.`
			`const SMALL_DIM: usize = 5;`

			`if is_dynamic &&`
			`nrows1 > SMALL_DIM && ncols1 > SMALL_DIM &&`
			`nrows2 > SMALL_DIM && ncols2 > SMALL_DIM {`
			`if N::is::<f32>() {`
			`let (rsa, csa) = a.strides();`
			`let (rsb, csb) = b.strides();`
			`let (rsc, csc) = self.strides();`

			`unsafe {`
			`matrixmultiply::sgemm(`
			`nrows2,`
			`ncols2,`
			`ncols3,`
			`mem::transmute_copy(&alpha),`
			`a.data.ptr() as *const f32,`
			`rsa as isize, csa as isize,`
			`b.data.ptr() as *const f32,`
			`rsb as isize, csb as isize,`
			`mem::transmute_copy(&beta),`
			`self.data.ptr_mut() as *mut f32,`
			`rsc as isize, csc as isize);`
			`}`
			`}`
			`else if N::is::<f64>() {`
			`let (rsa, csa) = a.strides();`
			`let (rsb, csb) = b.strides();`
			`let (rsc, csc) = self.strides();`

			`unsafe {`
			`matrixmultiply::dgemm(`
			`nrows2,`
			`ncols2,`
			`ncols3,`
			`mem::transmute_copy(&alpha),`
			`a.data.ptr() as *const f64,`
			`rsa as isize, csa as isize,`
			`b.data.ptr() as *const f64,`
			`rsb as isize, csb as isize,`
			`mem::transmute_copy(&beta),`
			`self.data.ptr_mut() as *mut f64,`
			`rsc as isize, csc as isize);`
			`}`
			`}`
			`}`
			`else {`
			`for j1 in 0 .. ncols1 {`
			`// FIXME: avoid bound checks.`
			`self.column_mut(j1).gemv(alpha, a, &b.column(j1), beta);`
			`}`
			`}`
			`}`
			`}`


			`impl<N, R1: Dim, C1: Dim, S: StorageMut<N, R1, C1>> Matrix<N, R1, C1, S>`
			`where N: Scalar + Zero + ClosedAdd + ClosedMul {`
			/// Computes `self = alpha * x * y.transpose() + beta * self`, where `self` is a symmetric
			`/// matrix.`
			`///`
			/// If `beta` is zero, `self` is never read. The result is symmetric. Only the lower-triangular
			/// (including the diagonal) part of `self` is read/written.
			`#[inline]`
			`pub fn ger_symm<D2: Dim, D3: Dim, SB, SC>(&mut self,`
			`alpha: N,`
			`x: &Vector<N, D2, SB>,`
			`y: &Vector<N, D3, SC>,`
			`beta: N)`
			`where N: One,`
			`SB: Storage<N, D2>,`
			`SC: Storage<N, D3>,`
			`ShapeConstraint: DimEq<R1, D2> + DimEq<C1, D3> {`
			`let dim1 = self.nrows();`
			`let dim2 = x.nrows();`
			`let dim3 = y.nrows();`

			`assert!(self.is_square(), "Symmetric ger: the input matrix must be square.");`
			`assert!(dim1 == dim2 && dim1 == dim3, "ger: dimensions mismatch.");`

			`for j in 0 .. dim1 {`
			`// FIXME: avoid bound checks.`
			`let val = unsafe { *y.vget_unchecked(j) };`
			`let subdim = Dynamic::new(dim1 - j);`
			`self.generic_slice_mut((j, j), (subdim, U1)).axpy(alpha * val, &x.rows_range(j ..), beta);`
			`}`
			`}`
			`}`