diff --git a/nalgebra-sparse/src/ops/serial/cs.rs b/nalgebra-sparse/src/ops/serial/cs.rs
index a05e4fdf..f3a14924 100644
--- a/nalgebra-sparse/src/ops/serial/cs.rs
+++ b/nalgebra-sparse/src/ops/serial/cs.rs
@@ -1,5 +1,3 @@
-//use std::collections::HashSet;
-
 use crate::cs::CsMatrix;
 use crate::ops::serial::{OperationError, OperationErrorKind};
 use crate::ops::Op;
@@ -7,12 +5,12 @@ use crate::SparseEntryMut;
 use nalgebra::{ClosedAdd, ClosedMul, DMatrixSlice, DMatrixSliceMut, Scalar};
 use num_traits::{One, Zero};
 
-//fn spmm_cs_unexpected_entry() -> OperationError {
-//    OperationError::from_kind_and_message(
-//        OperationErrorKind::InvalidPattern,
-//        String::from("Found unexpected entry that is not present in `c`."),
-//    )
-//}
+fn spmm_cs_unexpected_entry() -> OperationError {
+    OperationError::from_kind_and_message(
+        OperationErrorKind::InvalidPattern,
+        String::from("Found unexpected entry that is not present in `c`."),
+    )
+}
 
 /// Helper functionality for implementing CSR/CSC SPMM.
 ///
@@ -22,7 +20,7 @@ use num_traits::{One, Zero};
 /// reversed (since transpose(AB) = transpose(B) * transpose(A) and CSC(A) = transpose(CSR(A)).
 ///
 /// We assume here that the matrices have already been verified to be dimensionally compatible.
-pub fn spmm_cs_prealloc<T>(
+pub fn spmm_cs_prealloc_unchecked<T>(
     beta: T,
     c: &mut CsMatrix<T>,
     alpha: T,
@@ -43,8 +41,10 @@ where
             let b_lane_k = b.get_lane(k).unwrap();
             let alpha_aik = alpha.clone() * a_ik.clone();
             for (j, b_kj) in b_lane_k.minor_indices().iter().zip(b_lane_k.values()) {
-                // Determine the location in C to append the value
-                scratchpad_values[*j] += alpha_aik.clone() * b_kj.clone();
+                // use a dense scatter vector to accumulate non-zeros quickly
+                unsafe {
+                    *scratchpad_values.get_unchecked_mut(*j) += alpha_aik.clone() * b_kj.clone();
+                }
             }
         }
 
@@ -53,15 +53,55 @@ where
         values
             .iter_mut()
             .zip(indices)
-            .for_each(|(output_ref, index)| {
-                *output_ref = beta.clone() * output_ref.clone() + scratchpad_values[*index].clone();
-                scratchpad_values[*index] = Zero::zero();
+            .for_each(|(output_ref, index)| unsafe {
+                *output_ref = beta.clone() * output_ref.clone()
+                    + scratchpad_values.get_unchecked(*index).clone();
+                *scratchpad_values.get_unchecked_mut(*index) = Zero::zero();
             });
     }
 
     Ok(())
 }
 
+pub fn spmm_cs_prealloc_checked<T>(
+    beta: T,
+    c: &mut CsMatrix<T>,
+    alpha: T,
+    a: &CsMatrix<T>,
+    b: &CsMatrix<T>,
+) -> Result<(), OperationError>
+where
+    T: Scalar + ClosedAdd + ClosedMul + Zero + One,
+{
+    for i in 0..c.pattern().major_dim() {
+        let a_lane_i = a.get_lane(i).unwrap();
+        let mut c_lane_i = c.get_lane_mut(i).unwrap();
+        for c_ij in c_lane_i.values_mut() {
+            *c_ij = beta.clone() * c_ij.clone();
+        }
+
+        for (&k, a_ik) in a_lane_i.minor_indices().iter().zip(a_lane_i.values()) {
+            let b_lane_k = b.get_lane(k).unwrap();
+            let (mut c_lane_i_cols, mut c_lane_i_values) = c_lane_i.indices_and_values_mut();
+            let alpha_aik = alpha.clone() * a_ik.clone();
+            for (j, b_kj) in b_lane_k.minor_indices().iter().zip(b_lane_k.values()) {
+                // Determine the location in C to append the value
+                let (c_local_idx, _) = c_lane_i_cols
+                    .iter()
+                    .enumerate()
+                    .find(|(_, c_col)| *c_col == j)
+                    .ok_or_else(spmm_cs_unexpected_entry)?;
+
+                c_lane_i_values[c_local_idx] += alpha_aik.clone() * b_kj.clone();
+                c_lane_i_cols = &c_lane_i_cols[c_local_idx..];
+                c_lane_i_values = &mut c_lane_i_values[c_local_idx..];
+            }
+        }
+    }
+
+    Ok(())
+}
+
 fn spadd_cs_unexpected_entry() -> OperationError {
     OperationError::from_kind_and_message(
         OperationErrorKind::InvalidPattern,
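The cs.rs change is the heart of the speedup: the unchecked kernel scatters each contribution `alpha * a_ik * b_kj` into a dense scratch vector indexed by column, O(1) per term, then gathers the row back out through C's precomputed sparsity pattern, while the checked kernel keeps the linear probe through C's row so it can report pattern mismatches. The standalone sketch below shows the scatter/gather idea in safe code; the `Csr` struct and `spmm_row_scatter` helper are illustrative stand-ins, not the crate's API.

```rust
/// Illustrative CSR container; a stand-in for the crate-internal `CsMatrix`.
struct Csr {
    offsets: Vec<usize>, // row i occupies indices[offsets[i]..offsets[i + 1]]
    indices: Vec<usize>, // column index of each non-zero
    values: Vec<f64>,    // value of each non-zero
    ncols: usize,
}

/// Compute the values of row `i` of `C = A * B`, gathered through C's
/// precomputed column pattern `c_row_cols`.
fn spmm_row_scatter(a: &Csr, b: &Csr, i: usize, c_row_cols: &[usize]) -> Vec<f64> {
    // Dense scatter vector: one slot per column of B. It is zeroed between
    // rows, so one allocation can be reused for the whole product.
    let mut scratch = vec![0.0; b.ncols];
    for idx in a.offsets[i]..a.offsets[i + 1] {
        let (k, a_ik) = (a.indices[idx], a.values[idx]);
        for bidx in b.offsets[k]..b.offsets[k + 1] {
            // O(1) accumulation per term; no search through C's pattern.
            scratch[b.indices[bidx]] += a_ik * b.values[bidx];
        }
    }
    // Gather the non-zeros back out through C's pattern, resetting each
    // touched slot so the scratchpad is clean for the next row.
    c_row_cols
        .iter()
        .map(|&j| std::mem::take(&mut scratch[j]))
        .collect()
}

fn main() {
    // A = [[1, 2], [0, 3]], B = [[4, 0], [0, 5]] in CSR form.
    let a = Csr { offsets: vec![0, 2, 3], indices: vec![0, 1, 1], values: vec![1.0, 2.0, 3.0], ncols: 2 };
    let b = Csr { offsets: vec![0, 1, 2], indices: vec![0, 1], values: vec![4.0, 5.0], ncols: 2 };
    // Row 0 of C has pattern {0, 1}: C[0] = [4, 10].
    assert_eq!(spmm_row_scatter(&a, &b, 0, &[0, 1]), vec![4.0, 10.0]);
}
```

The `get_unchecked`/`get_unchecked_mut` calls in the kernel elide the bounds checks this sketch pays for; that is sound there because every index comes from a pattern already sized against the scratchpad.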
diff --git a/nalgebra-sparse/src/ops/serial/csc.rs b/nalgebra-sparse/src/ops/serial/csc.rs
index e5c9ae4e..42d27079 100644
--- a/nalgebra-sparse/src/ops/serial/csc.rs
+++ b/nalgebra-sparse/src/ops/serial/csc.rs
@@ -1,5 +1,7 @@
 use crate::csc::CscMatrix;
-use crate::ops::serial::cs::{spadd_cs_prealloc, spmm_cs_dense, spmm_cs_prealloc};
+use crate::ops::serial::cs::{
+    spadd_cs_prealloc, spmm_cs_dense, spmm_cs_prealloc_checked, spmm_cs_prealloc_unchecked,
+};
 use crate::ops::serial::{OperationError, OperationErrorKind};
 use crate::ops::Op;
 use nalgebra::{ClosedAdd, ClosedMul, DMatrixSlice, DMatrixSliceMut, RealField, Scalar};
@@ -71,7 +73,7 @@ where
 /// # Panics
 ///
 /// Panics if the dimensions of the matrices involved are not compatible with the expression.
-pub fn spmm_csc_prealloc<T>(
+pub fn spmm_csc_prealloc_checked<T>(
     beta: T,
     c: &mut CscMatrix<T>,
     alpha: T,
@@ -83,35 +85,81 @@ where
 {
     assert_compatible_spmm_dims!(c, a, b);
 
-    use Op::{NoOp, Transpose};
+    use Op::NoOp;
 
     match (&a, &b) {
         (NoOp(ref a), NoOp(ref b)) => {
             // Note: We have to reverse the order for CSC matrices
-            spmm_cs_prealloc(beta, &mut c.cs, alpha, &b.cs, &a.cs)
-        }
-        _ => {
-            // Currently we handle transposition by explicitly precomputing transposed matrices
-            // and calling the operation again without transposition
-            let a_ref: &CscMatrix<T> = a.inner_ref();
-            let b_ref: &CscMatrix<T> = b.inner_ref();
-            let (a, b) = {
-                use Cow::*;
-                match (&a, &b) {
-                    (NoOp(_), NoOp(_)) => unreachable!(),
-                    (Transpose(ref a), NoOp(_)) => (Owned(a.transpose()), Borrowed(b_ref)),
-                    (NoOp(_), Transpose(ref b)) => (Borrowed(a_ref), Owned(b.transpose())),
-                    (Transpose(ref a), Transpose(ref b)) => {
-                        (Owned(a.transpose()), Owned(b.transpose()))
-                    }
-                }
-            };
-
-            spmm_csc_prealloc(beta, c, alpha, NoOp(a.as_ref()), NoOp(b.as_ref()))
+            spmm_cs_prealloc_checked(beta, &mut c.cs, alpha, &b.cs, &a.cs)
         }
+        _ => do_transposes(beta, c, alpha, a, b, spmm_csc_prealloc_checked),
     }
 }
 
+/// Faster sparse-sparse matrix multiplication, `C <- beta * C + alpha * op(A) * op(B)`.
+/// This will not return an error even if the patterns don't match.
+/// Should be used for situations where pattern creation immediately precedes multiplication.
+///
+/// Panics if the dimensions of the matrices involved are not compatible with the expression.
+pub(crate) fn spmm_csc_prealloc_unchecked<T>(
+    beta: T,
+    c: &mut CscMatrix<T>,
+    alpha: T,
+    a: Op<&CscMatrix<T>>,
+    b: Op<&CscMatrix<T>>,
+) -> Result<(), OperationError>
+where
+    T: Scalar + ClosedAdd + ClosedMul + Zero + One,
+{
+    assert_compatible_spmm_dims!(c, a, b);
+
+    use Op::NoOp;
+
+    match (&a, &b) {
+        (NoOp(ref a), NoOp(ref b)) => {
+            // Note: We have to reverse the order for CSC matrices
+            spmm_cs_prealloc_unchecked(beta, &mut c.cs, alpha, &b.cs, &a.cs)
+        }
+        _ => do_transposes(beta, c, alpha, a, b, spmm_csc_prealloc_unchecked),
+    }
+}
+
+fn do_transposes<T, F>(
+    beta: T,
+    c: &mut CscMatrix<T>,
+    alpha: T,
+    a: Op<&CscMatrix<T>>,
+    b: Op<&CscMatrix<T>>,
+    caller: F,
+) -> Result<(), OperationError>
+where
+    T: Scalar + ClosedAdd + ClosedMul + Zero + One,
+    F: Fn(
+        T,
+        &mut CscMatrix<T>,
+        T,
+        Op<&CscMatrix<T>>,
+        Op<&CscMatrix<T>>,
+    ) -> Result<(), OperationError>,
+{
+    use Op::{NoOp, Transpose};
+
+    // Currently we handle transposition by explicitly precomputing transposed matrices
+    // and calling the operation again without transposition
+    let a_ref: &CscMatrix<T> = a.inner_ref();
+    let b_ref: &CscMatrix<T> = b.inner_ref();
+    let (a, b) = {
+        use Cow::*;
+        match (&a, &b) {
+            (NoOp(_), NoOp(_)) => unreachable!(),
+            (Transpose(ref a), NoOp(_)) => (Owned(a.transpose()), Borrowed(b_ref)),
+            (NoOp(_), Transpose(ref b)) => (Borrowed(a_ref), Owned(b.transpose())),
+            (Transpose(ref a), Transpose(ref b)) => (Owned(a.transpose()), Owned(b.transpose())),
        }
+    };
+    caller(beta, c, alpha, NoOp(a.as_ref()), NoOp(b.as_ref()))
+}
+
 /// Solve the lower triangular system `op(L) X = B`.
 ///
 /// Only the lower triangular part of L is read, and the result is stored in B.
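Both CSC entry points now funnel their transpose fallback through `do_transposes`, which materializes owned transposes only where needed and re-enters whichever kernel requested the fallback through the `caller` argument. A toy sketch of that shape, with placeholder `Mat` and `Op` types standing in for the nalgebra-sparse ones:

```rust
use std::borrow::Cow;

// Placeholder matrix type; only `transpose` matters for the sketch.
#[derive(Clone)]
struct Mat;
impl Mat {
    fn transpose(&self) -> Mat {
        Mat
    }
}

enum Op<M> {
    NoOp(M),
    Transpose(M),
}

// Normalize transposed operands, then re-enter the caller-supplied kernel
// with plain operands, mirroring the refactor in the diff above.
fn do_transposes<F>(a: Op<&Mat>, b: Op<&Mat>, caller: F)
where
    F: Fn(Op<&Mat>, Op<&Mat>),
{
    use Op::{NoOp, Transpose};
    // Materialize owned transposes only where needed, borrowing otherwise.
    let (a, b): (Cow<Mat>, Cow<Mat>) = match (a, b) {
        (NoOp(a), NoOp(b)) => (Cow::Borrowed(a), Cow::Borrowed(b)),
        (Transpose(a), NoOp(b)) => (Cow::Owned(a.transpose()), Cow::Borrowed(b)),
        (NoOp(a), Transpose(b)) => (Cow::Borrowed(a), Cow::Owned(b.transpose())),
        (Transpose(a), Transpose(b)) => (Cow::Owned(a.transpose()), Cow::Owned(b.transpose())),
    };
    // Both operands are now transpose-free; re-enter the requesting kernel.
    caller(Op::NoOp(a.as_ref()), Op::NoOp(b.as_ref()));
}

fn main() {
    // Either kernel can hand itself to the helper as the re-entry point.
    do_transposes(Op::Transpose(&Mat), Op::NoOp(&Mat), |_a, _b| {
        println!("re-entered with plain operands");
    });
}
```

Taking the kernel as `F: Fn(...)` keeps one copy of the `Cow` juggling while letting the checked and unchecked paths remain separate entry points.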
diff --git a/nalgebra-sparse/src/ops/serial/csr.rs b/nalgebra-sparse/src/ops/serial/csr.rs
index fa317bbf..967c02f2 100644
--- a/nalgebra-sparse/src/ops/serial/csr.rs
+++ b/nalgebra-sparse/src/ops/serial/csr.rs
@@ -1,5 +1,7 @@
 use crate::csr::CsrMatrix;
-use crate::ops::serial::cs::{spadd_cs_prealloc, spmm_cs_dense, spmm_cs_prealloc};
+use crate::ops::serial::cs::{
+    spadd_cs_prealloc, spmm_cs_dense, spmm_cs_prealloc_checked, spmm_cs_prealloc_unchecked,
+};
 use crate::ops::serial::OperationError;
 use crate::ops::Op;
 use nalgebra::{ClosedAdd, ClosedMul, DMatrixSlice, DMatrixSliceMut, Scalar};
@@ -65,7 +67,7 @@ where
 /// # Panics
 ///
 /// Panics if the dimensions of the matrices involved are not compatible with the expression.
-pub fn spmm_csr_prealloc<T>(
+pub fn spmm_csr_prealloc_checked<T>(
     beta: T,
     c: &mut CsrMatrix<T>,
     alpha: T,
@@ -77,30 +79,75 @@ where
 {
     assert_compatible_spmm_dims!(c, a, b);
 
-    use Op::{NoOp, Transpose};
+    use Op::NoOp;
 
     match (&a, &b) {
-        (NoOp(ref a), NoOp(ref b)) => spmm_cs_prealloc(beta, &mut c.cs, alpha, &a.cs, &b.cs),
-        _ => {
-            // Currently we handle transposition by explicitly precomputing transposed matrices
-            // and calling the operation again without transposition
-            // TODO: At least use workspaces to allow control of allocations. Maybe
-            //       consider implementing certain patterns (like A^T * B) explicitly
-            let a_ref: &CsrMatrix<T> = a.inner_ref();
-            let b_ref: &CsrMatrix<T> = b.inner_ref();
-            let (a, b) = {
-                use Cow::*;
-                match (&a, &b) {
-                    (NoOp(_), NoOp(_)) => unreachable!(),
-                    (Transpose(ref a), NoOp(_)) => (Owned(a.transpose()), Borrowed(b_ref)),
-                    (NoOp(_), Transpose(ref b)) => (Borrowed(a_ref), Owned(b.transpose())),
-                    (Transpose(ref a), Transpose(ref b)) => {
-                        (Owned(a.transpose()), Owned(b.transpose()))
-                    }
-                }
-            };
-
-            spmm_csr_prealloc(beta, c, alpha, NoOp(a.as_ref()), NoOp(b.as_ref()))
+        (NoOp(ref a), NoOp(ref b)) => {
+            spmm_cs_prealloc_checked(beta, &mut c.cs, alpha, &a.cs, &b.cs)
         }
+        _ => do_transposes(beta, c, alpha, a, b, spmm_csr_prealloc_checked),
     }
 }
+
+/// Faster sparse-sparse matrix multiplication, `C <- beta * C + alpha * op(A) * op(B)`.
+/// This will not return an error even if the patterns don't match.
+/// Should be used for situations where pattern creation immediately precedes multiplication.
+///
+/// Panics if the dimensions of the matrices involved are not compatible with the expression.
+pub(crate) fn spmm_csr_prealloc_unchecked<T>(
+    beta: T,
+    c: &mut CsrMatrix<T>,
+    alpha: T,
+    a: Op<&CsrMatrix<T>>,
+    b: Op<&CsrMatrix<T>>,
+) -> Result<(), OperationError>
+where
+    T: Scalar + ClosedAdd + ClosedMul + Zero + One,
+{
+    assert_compatible_spmm_dims!(c, a, b);
+
+    use Op::NoOp;
+
+    match (&a, &b) {
+        (NoOp(ref a), NoOp(ref b)) => {
+            spmm_cs_prealloc_unchecked(beta, &mut c.cs, alpha, &a.cs, &b.cs)
+        }
+        _ => do_transposes(beta, c, alpha, a, b, spmm_csr_prealloc_unchecked),
+    }
+}
+
+fn do_transposes<T, F>(
+    beta: T,
+    c: &mut CsrMatrix<T>,
+    alpha: T,
+    a: Op<&CsrMatrix<T>>,
+    b: Op<&CsrMatrix<T>>,
+    caller: F,
+) -> Result<(), OperationError>
+where
+    T: Scalar + ClosedAdd + ClosedMul + Zero + One,
+    F: Fn(
+        T,
+        &mut CsrMatrix<T>,
+        T,
+        Op<&CsrMatrix<T>>,
+        Op<&CsrMatrix<T>>,
+    ) -> Result<(), OperationError>,
+{
+    use Op::{NoOp, Transpose};
+
+    // Currently we handle transposition by explicitly precomputing transposed matrices
+    // and calling the operation again without transposition
+    let a_ref: &CsrMatrix<T> = a.inner_ref();
+    let b_ref: &CsrMatrix<T> = b.inner_ref();
+    let (a, b) = {
+        use Cow::*;
+        match (&a, &b) {
+            (NoOp(_), NoOp(_)) => unreachable!(),
+            (Transpose(ref a), NoOp(_)) => (Owned(a.transpose()), Borrowed(b_ref)),
+            (NoOp(_), Transpose(ref b)) => (Borrowed(a_ref), Owned(b.transpose())),
+            (Transpose(ref a), Transpose(ref b)) => (Owned(a.transpose()), Owned(b.transpose())),
+        }
+    };
+    caller(beta, c, alpha, NoOp(a.as_ref()), NoOp(b.as_ref()))
+}
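The CSR side mirrors the CSC changes, and both checked entry points ultimately rely on the linear probe in `spmm_cs_prealloc_checked` to detect entries missing from C's pattern. A distilled sketch of that probe, with illustrative names and a plain `String` error standing in for `OperationError`:

```rust
// Each term alpha * a_ik * b_kj must land on an entry already present in
// C's row pattern. Because both C's row columns and B's row columns are
// sorted, the search resumes from the previous hit (the diff's re-slicing
// of `c_lane_i_cols`) instead of rescanning the row from the start.
fn accumulate_checked(
    c_cols: &[usize],   // sorted column indices of C's row i
    c_vals: &mut [f64], // values of C's row i, parallel to `c_cols`
    j: usize,           // target column, ascending across one row of B
    term: f64,          // alpha * a_ik * b_kj
    cursor: &mut usize, // resume point; reset when moving to a new row of B
) -> Result<(), String> {
    let local = c_cols[*cursor..]
        .iter()
        .position(|&col| col == j)
        .ok_or_else(|| String::from("entry not present in `c`"))?;
    *cursor += local;
    c_vals[*cursor] += term;
    Ok(())
}

fn main() {
    let c_cols = [1, 3, 7];
    let mut c_vals = [0.0; 3];
    let mut cursor = 0;
    // One row of B contributing to columns 1 and 7, in ascending order.
    accumulate_checked(&c_cols, &mut c_vals, 1, 2.5, &mut cursor).unwrap();
    accumulate_checked(&c_cols, &mut c_vals, 7, 4.0, &mut cursor).unwrap();
    assert_eq!(c_vals, [2.5, 0.0, 4.0]);
    // A column absent from C's pattern is exactly the mismatch the checked
    // path reports as an InvalidPattern error.
    assert!(accumulate_checked(&c_cols, &mut c_vals, 5, 1.0, &mut 0).is_err());
}
```

Because the cursor only moves forward, each row of B is merged against C's row in a single forward pass, so the checked path stays linear per row while still catching pattern mismatches that the unchecked path silently assumes away.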