diff --git a/nalgebra-sparse/src/ops/impl_std_ops.rs b/nalgebra-sparse/src/ops/impl_std_ops.rs
index 107c38ba..91b6574f 100644
--- a/nalgebra-sparse/src/ops/impl_std_ops.rs
+++ b/nalgebra-sparse/src/ops/impl_std_ops.rs
@@ -3,7 +3,7 @@ use crate::csr::CsrMatrix;

 use crate::ops::serial::{
     spadd_csc_prealloc, spadd_csr_prealloc, spadd_pattern, spmm_csc_dense, spmm_csc_pattern,
-    spmm_csc_prealloc, spmm_csr_dense, spmm_csr_pattern, spmm_csr_prealloc,
+    spmm_csc_prealloc_unchecked, spmm_csr_dense, spmm_csr_pattern, spmm_csr_prealloc_unchecked,
 };
 use crate::ops::Op;
 use nalgebra::allocator::Allocator;
@@ -112,9 +112,9 @@ macro_rules! impl_spmm {
     }
 }

-impl_spmm!(CsrMatrix, spmm_csr_pattern, spmm_csr_prealloc);
+impl_spmm!(CsrMatrix, spmm_csr_pattern, spmm_csr_prealloc_unchecked);
 // Need to switch order of operations for CSC pattern
-impl_spmm!(CscMatrix, spmm_csc_pattern, spmm_csc_prealloc);
+impl_spmm!(CscMatrix, spmm_csc_pattern, spmm_csc_prealloc_unchecked);

 /// Implements Scalar * Matrix operations for *concrete* scalar types. The reason this is necessary
 /// is that we are not able to implement Mul<Matrix<T>> for all T generically due to orphan rules.
diff --git a/nalgebra-sparse/src/ops/serial/cs.rs b/nalgebra-sparse/src/ops/serial/cs.rs
index 86484053..cc13c168 100644
--- a/nalgebra-sparse/src/ops/serial/cs.rs
+++ b/nalgebra-sparse/src/ops/serial/cs.rs
@@ -20,6 +20,51 @@ fn spmm_cs_unexpected_entry() -> OperationError {
 /// reversed (since transpose(AB) = transpose(B) * transpose(A) and CSC(A) = transpose(CSR(A)).
 ///
 /// We assume here that the matrices have already been verified to be dimensionally compatible.
+pub fn spmm_cs_prealloc_unchecked<T>(
+    beta: T,
+    c: &mut CsMatrix<T>,
+    alpha: T,
+    a: &CsMatrix<T>,
+    b: &CsMatrix<T>,
+) -> Result<(), OperationError>
+where
+    T: Scalar + ClosedAdd + ClosedMul + Zero + One,
+{
+    assert_eq!(c.pattern().major_dim(), a.pattern().major_dim());
+    assert_eq!(c.pattern().minor_dim(), b.pattern().minor_dim());
+    let some_val = Zero::zero();
+    let mut scratchpad_values: Vec<T> = vec![some_val; b.pattern().minor_dim()];
+    for i in 0..c.pattern().major_dim() {
+        let a_lane_i = a.get_lane(i).unwrap();
+
+        let mut c_lane_i = c.get_lane_mut(i).unwrap();
+
+        for (&k, a_ik) in a_lane_i.minor_indices().iter().zip(a_lane_i.values()) {
+            let b_lane_k = b.get_lane(k).unwrap();
+            let alpha_aik = alpha.clone() * a_ik.clone();
+            for (j, b_kj) in b_lane_k.minor_indices().iter().zip(b_lane_k.values()) {
+                // Use a dense scatter vector to accumulate non-zeros quickly
+                unsafe {
+                    *scratchpad_values.get_unchecked_mut(*j) += alpha_aik.clone() * b_kj.clone();
+                }
+            }
+        }
+
+        // Get indices from the C pattern and gather from the dense scratchpad_values
+        let (indices, values) = c_lane_i.indices_and_values_mut();
+        values
+            .iter_mut()
+            .zip(indices)
+            .for_each(|(output_ref, index)| unsafe {
+                *output_ref = beta.clone() * output_ref.clone()
+                    + scratchpad_values.get_unchecked(*index).clone();
+                *scratchpad_values.get_unchecked_mut(*index) = Zero::zero();
+            });
+    }
+
+    Ok(())
+}
+
 pub fn spmm_cs_prealloc<T>(
     beta: T,
     c: &mut CsMatrix<T>,
diff --git a/nalgebra-sparse/src/ops/serial/csc.rs b/nalgebra-sparse/src/ops/serial/csc.rs
index e5c9ae4e..6a691ef9 100644
--- a/nalgebra-sparse/src/ops/serial/csc.rs
+++ b/nalgebra-sparse/src/ops/serial/csc.rs
@@ -1,5 +1,7 @@
 use crate::csc::CscMatrix;
-use crate::ops::serial::cs::{spadd_cs_prealloc, spmm_cs_dense, spmm_cs_prealloc};
+use crate::ops::serial::cs::{
+    spadd_cs_prealloc, spmm_cs_dense, spmm_cs_prealloc, spmm_cs_prealloc_unchecked,
+};
 use crate::ops::serial::{OperationError, OperationErrorKind};
 use crate::ops::Op;
 use nalgebra::{ClosedAdd, ClosedMul, DMatrixSlice, DMatrixSliceMut, RealField, Scalar};
@@ -83,35 +85,81 @@ where
 {
     assert_compatible_spmm_dims!(c, a, b);

-    use Op::{NoOp, Transpose};
+    use Op::NoOp;

     match (&a, &b) {
         (NoOp(ref a), NoOp(ref b)) => {
             // Note: We have to reverse the order for CSC matrices
             spmm_cs_prealloc(beta, &mut c.cs, alpha, &b.cs, &a.cs)
         }
-        _ => {
-            // Currently we handle transposition by explicitly precomputing transposed matrices
-            // and calling the operation again without transposition
-            let a_ref: &CscMatrix<T> = a.inner_ref();
-            let b_ref: &CscMatrix<T> = b.inner_ref();
-            let (a, b) = {
-                use Cow::*;
-                match (&a, &b) {
-                    (NoOp(_), NoOp(_)) => unreachable!(),
-                    (Transpose(ref a), NoOp(_)) => (Owned(a.transpose()), Borrowed(b_ref)),
-                    (NoOp(_), Transpose(ref b)) => (Borrowed(a_ref), Owned(b.transpose())),
-                    (Transpose(ref a), Transpose(ref b)) => {
-                        (Owned(a.transpose()), Owned(b.transpose()))
-                    }
-                }
-            };
-
-            spmm_csc_prealloc(beta, c, alpha, NoOp(a.as_ref()), NoOp(b.as_ref()))
-        }
+        _ => spmm_csc_transposed(beta, c, alpha, a, b, spmm_csc_prealloc),
     }
 }

+/// Faster sparse-sparse matrix multiplication, `C <- beta * C + alpha * op(A) * op(B)`.
+/// This will not return an error even if the patterns don't match.
+/// Should be used for situations where pattern creation immediately precedes multiplication.
+///
+/// Panics if the dimensions of the matrices involved are not compatible with the expression.
+pub fn spmm_csc_prealloc_unchecked<T>(
+    beta: T,
+    c: &mut CscMatrix<T>,
+    alpha: T,
+    a: Op<&CscMatrix<T>>,
+    b: Op<&CscMatrix<T>>,
+) -> Result<(), OperationError>
+where
+    T: Scalar + ClosedAdd + ClosedMul + Zero + One,
+{
+    assert_compatible_spmm_dims!(c, a, b);
+
+    use Op::NoOp;
+
+    match (&a, &b) {
+        (NoOp(ref a), NoOp(ref b)) => {
+            // Note: We have to reverse the order for CSC matrices
+            spmm_cs_prealloc_unchecked(beta, &mut c.cs, alpha, &b.cs, &a.cs)
+        }
+        _ => spmm_csc_transposed(beta, c, alpha, a, b, spmm_csc_prealloc_unchecked),
+    }
+}
+
+fn spmm_csc_transposed<T, F>(
+    beta: T,
+    c: &mut CscMatrix<T>,
+    alpha: T,
+    a: Op<&CscMatrix<T>>,
+    b: Op<&CscMatrix<T>>,
+    spmm_kernel: F,
+) -> Result<(), OperationError>
+where
+    T: Scalar + ClosedAdd + ClosedMul + Zero + One,
+    F: Fn(
+        T,
+        &mut CscMatrix<T>,
+        T,
+        Op<&CscMatrix<T>>,
+        Op<&CscMatrix<T>>,
+    ) -> Result<(), OperationError>,
+{
+    use Op::{NoOp, Transpose};
+
+    // Currently we handle transposition by explicitly precomputing transposed matrices
+    // and calling the operation again without transposition
+    let a_ref: &CscMatrix<T> = a.inner_ref();
+    let b_ref: &CscMatrix<T> = b.inner_ref();
+    let (a, b) = {
+        use Cow::*;
+        match (&a, &b) {
+            (NoOp(_), NoOp(_)) => unreachable!(),
+            (Transpose(ref a), NoOp(_)) => (Owned(a.transpose()), Borrowed(b_ref)),
+            (NoOp(_), Transpose(ref b)) => (Borrowed(a_ref), Owned(b.transpose())),
+            (Transpose(ref a), Transpose(ref b)) => (Owned(a.transpose()), Owned(b.transpose())),
+        }
+    };
+    spmm_kernel(beta, c, alpha, NoOp(a.as_ref()), NoOp(b.as_ref()))
+}
+
 /// Solve the lower triangular system `op(L) X = B`.
 ///
 /// Only the lower triangular part of L is read, and the result is stored in B.
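The speedup in `spmm_cs_prealloc_unchecked` above comes from the dense scratchpad: each product term `alpha * a_ik * b_kj` is scattered into a dense per-column buffer in O(1), and the results are then gathered back through C's precomputed indices, instead of searching C's index list for every term. Below is a minimal standalone sketch of that scatter/gather idea on plain slices; all names are illustrative, not library APIs.

```rust
// Toy sketch of the scatter/gather accumulation used by the kernel,
// for a single output lane (row i of C in the CSR view).
fn scatter_gather_lane(
    beta: f64,
    alpha_products: &[(usize, f64)], // (column j, alpha * a_ik * b_kj) terms for row i
    c_indices: &[usize],             // column indices of C's row i (precomputed pattern)
    c_values: &mut [f64],            // values of C's row i, updated in place
    scratch: &mut [f64],             // dense buffer, one slot per column, initially zero
) {
    // Scatter: accumulate every contribution at its column slot in O(1),
    // instead of searching C's index list for each product term.
    for &(j, v) in alpha_products {
        scratch[j] += v;
    }
    // Gather: walk C's precomputed indices, combine with beta * C, and
    // reset each slot so the scratchpad is clean for the next lane.
    for (val, &j) in c_values.iter_mut().zip(c_indices) {
        *val = beta * *val + scratch[j];
        scratch[j] = 0.0;
    }
}
```

Note that a contribution at a column missing from `c_indices` would linger in `scratch` and corrupt later lanes; this is why the doc comments restrict the unchecked variants to cases where C's pattern was just computed from A's and B's patterns.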
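The intended calling sequence, per the new doc comments ("pattern creation immediately precedes multiplication"), looks roughly like the sketch below. The helper `multiply_csc` is hypothetical; `spmm_csc_pattern` and `CscMatrix::try_from_pattern_and_values` are existing nalgebra-sparse APIs.

```rust
use nalgebra_sparse::csc::CscMatrix;
use nalgebra_sparse::ops::serial::{spmm_csc_pattern, spmm_csc_prealloc_unchecked};
use nalgebra_sparse::ops::Op;

// Hypothetical helper, not part of the patch: because C's pattern is computed
// from A and B immediately before the multiplication, the unchecked kernel
// can safely skip per-entry pattern validation.
fn multiply_csc(a: &CscMatrix<f64>, b: &CscMatrix<f64>) -> CscMatrix<f64> {
    // Compute the exact sparsity pattern of A * B up front...
    let pattern = spmm_csc_pattern(a.pattern(), b.pattern());
    let values = vec![0.0; pattern.nnz()];
    let mut c = CscMatrix::try_from_pattern_and_values(pattern, values)
        .expect("pattern and values are consistent by construction");
    // ...then multiply into the preallocated C with beta = 0, alpha = 1.
    spmm_csc_prealloc_unchecked(0.0, &mut c, 1.0, Op::NoOp(a), Op::NoOp(b))
        .expect("dimensions are compatible by construction");
    c
}
```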
diff --git a/nalgebra-sparse/src/ops/serial/csr.rs b/nalgebra-sparse/src/ops/serial/csr.rs
index fa317bbf..708c81a3 100644
--- a/nalgebra-sparse/src/ops/serial/csr.rs
+++ b/nalgebra-sparse/src/ops/serial/csr.rs
@@ -1,5 +1,7 @@
 use crate::csr::CsrMatrix;
-use crate::ops::serial::cs::{spadd_cs_prealloc, spmm_cs_dense, spmm_cs_prealloc};
+use crate::ops::serial::cs::{
+    spadd_cs_prealloc, spmm_cs_dense, spmm_cs_prealloc, spmm_cs_prealloc_unchecked,
+};
 use crate::ops::serial::OperationError;
 use crate::ops::Op;
 use nalgebra::{ClosedAdd, ClosedMul, DMatrixSlice, DMatrixSliceMut, Scalar};
@@ -77,30 +79,73 @@ where
 {
     assert_compatible_spmm_dims!(c, a, b);

-    use Op::{NoOp, Transpose};
+    use Op::NoOp;

     match (&a, &b) {
         (NoOp(ref a), NoOp(ref b)) => spmm_cs_prealloc(beta, &mut c.cs, alpha, &a.cs, &b.cs),
-        _ => {
-            // Currently we handle transposition by explicitly precomputing transposed matrices
-            // and calling the operation again without transposition
-            // TODO: At least use workspaces to allow control of allocations. Maybe
-            // consider implementing certain patterns (like A^T * B) explicitly
-            let a_ref: &CsrMatrix<T> = a.inner_ref();
-            let b_ref: &CsrMatrix<T> = b.inner_ref();
-            let (a, b) = {
-                use Cow::*;
-                match (&a, &b) {
-                    (NoOp(_), NoOp(_)) => unreachable!(),
-                    (Transpose(ref a), NoOp(_)) => (Owned(a.transpose()), Borrowed(b_ref)),
-                    (NoOp(_), Transpose(ref b)) => (Borrowed(a_ref), Owned(b.transpose())),
-                    (Transpose(ref a), Transpose(ref b)) => {
-                        (Owned(a.transpose()), Owned(b.transpose()))
-                    }
-                }
-            };
-
-            spmm_csr_prealloc(beta, c, alpha, NoOp(a.as_ref()), NoOp(b.as_ref()))
-        }
+        _ => spmm_csr_transposed(beta, c, alpha, a, b, spmm_csr_prealloc),
     }
 }

+/// Faster sparse-sparse matrix multiplication, `C <- beta * C + alpha * op(A) * op(B)`.
+/// This will not return an error even if the patterns don't match.
+/// Should be used for situations where pattern creation immediately precedes multiplication.
+///
+/// Panics if the dimensions of the matrices involved are not compatible with the expression.
+pub fn spmm_csr_prealloc_unchecked<T>(
+    beta: T,
+    c: &mut CsrMatrix<T>,
+    alpha: T,
+    a: Op<&CsrMatrix<T>>,
+    b: Op<&CsrMatrix<T>>,
+) -> Result<(), OperationError>
+where
+    T: Scalar + ClosedAdd + ClosedMul + Zero + One,
+{
+    assert_compatible_spmm_dims!(c, a, b);
+
+    use Op::NoOp;
+
+    match (&a, &b) {
+        (NoOp(ref a), NoOp(ref b)) => {
+            spmm_cs_prealloc_unchecked(beta, &mut c.cs, alpha, &a.cs, &b.cs)
+        }
+        _ => spmm_csr_transposed(beta, c, alpha, a, b, spmm_csr_prealloc_unchecked),
+    }
+}
+
+fn spmm_csr_transposed<T, F>(
+    beta: T,
+    c: &mut CsrMatrix<T>,
+    alpha: T,
+    a: Op<&CsrMatrix<T>>,
+    b: Op<&CsrMatrix<T>>,
+    spmm_kernel: F,
+) -> Result<(), OperationError>
+where
+    T: Scalar + ClosedAdd + ClosedMul + Zero + One,
+    F: Fn(
+        T,
+        &mut CsrMatrix<T>,
+        T,
+        Op<&CsrMatrix<T>>,
+        Op<&CsrMatrix<T>>,
+    ) -> Result<(), OperationError>,
+{
+    use Op::{NoOp, Transpose};
+
+    // Currently we handle transposition by explicitly precomputing transposed matrices
+    // and calling the operation again without transposition
+    let a_ref: &CsrMatrix<T> = a.inner_ref();
+    let b_ref: &CsrMatrix<T> = b.inner_ref();
+    let (a, b) = {
+        use Cow::*;
+        match (&a, &b) {
+            (NoOp(_), NoOp(_)) => unreachable!(),
+            (Transpose(ref a), NoOp(_)) => (Owned(a.transpose()), Borrowed(b_ref)),
+            (NoOp(_), Transpose(ref b)) => (Borrowed(a_ref), Owned(b.transpose())),
+            (Transpose(ref a), Transpose(ref b)) => (Owned(a.transpose()), Owned(b.transpose())),
+        }
+    };
+    spmm_kernel(beta, c, alpha, NoOp(a.as_ref()), NoOp(b.as_ref()))
+}
diff --git a/nalgebra-sparse/tests/unit_tests/ops.rs b/nalgebra-sparse/tests/unit_tests/ops.rs
index f2a02fd8..0a335567 100644
--- a/nalgebra-sparse/tests/unit_tests/ops.rs
+++ b/nalgebra-sparse/tests/unit_tests/ops.rs
@@ -6,7 +6,8 @@ use nalgebra_sparse::csc::CscMatrix;
 use nalgebra_sparse::csr::CsrMatrix;
 use nalgebra_sparse::ops::serial::{
     spadd_csc_prealloc, spadd_csr_prealloc, spadd_pattern, spmm_csc_dense, spmm_csc_prealloc,
-    spmm_csr_dense, spmm_csr_pattern, spmm_csr_prealloc, spsolve_csc_lower_triangular,
+    spmm_csc_prealloc_unchecked, spmm_csr_dense, spmm_csr_pattern, spmm_csr_prealloc,
+    spmm_csr_prealloc_unchecked, spsolve_csc_lower_triangular,
 };
 use nalgebra_sparse::ops::Op;
 use nalgebra_sparse::pattern::SparsityPattern;
@@ -543,6 +544,29 @@ proptest! {
         prop_assert_eq!(&c_pattern, c_csr.pattern());
     }

+    #[test]
+    fn spmm_csr_prealloc_unchecked_test(SpmmCsrArgs { c, beta, alpha, a, b }
+        in spmm_csr_prealloc_args_strategy()
+    ) {
+        // Test that we get the expected result by comparing to an equivalent dense operation
+        // (here we pass in the C matrix, so the sparsity pattern is essentially fixed)
+        let mut c_sparse = c.clone();
+        spmm_csr_prealloc_unchecked(beta, &mut c_sparse, alpha, a.as_ref(), b.as_ref()).unwrap();
+
+        let mut c_dense = DMatrix::from(&c);
+        let op_a_dense = match a {
+            Op::NoOp(ref a) => DMatrix::from(a),
+            Op::Transpose(ref a) => DMatrix::from(a).transpose(),
+        };
+        let op_b_dense = match b {
+            Op::NoOp(ref b) => DMatrix::from(b),
+            Op::Transpose(ref b) => DMatrix::from(b).transpose(),
+        };
+        c_dense = beta * c_dense + alpha * &op_a_dense * op_b_dense;
+
+        prop_assert_eq!(&DMatrix::from(&c_sparse), &c_dense);
+    }
+
     #[test]
     fn spmm_csr_prealloc_test(SpmmCsrArgs { c, beta, alpha, a, b }
         in spmm_csr_prealloc_args_strategy()
@@ -705,6 +729,29 @@ proptest! {
         prop_assert_eq!(&DMatrix::from(&c_sparse), &c_dense);
     }

+    #[test]
+    fn spmm_csc_prealloc_unchecked_test(SpmmCscArgs { c, beta, alpha, a, b }
+        in spmm_csc_prealloc_args_strategy()
+    ) {
+        // Test that we get the expected result by comparing to an equivalent dense operation
+        // (here we pass in the C matrix, so the sparsity pattern is essentially fixed)
+        let mut c_sparse = c.clone();
+        spmm_csc_prealloc_unchecked(beta, &mut c_sparse, alpha, a.as_ref(), b.as_ref()).unwrap();
+
+        let mut c_dense = DMatrix::from(&c);
+        let op_a_dense = match a {
+            Op::NoOp(ref a) => DMatrix::from(a),
+            Op::Transpose(ref a) => DMatrix::from(a).transpose(),
+        };
+        let op_b_dense = match b {
+            Op::NoOp(ref b) => DMatrix::from(b),
+            Op::Transpose(ref b) => DMatrix::from(b).transpose(),
+        };
+        c_dense = beta * c_dense + alpha * &op_a_dense * op_b_dense;
+
+        prop_assert_eq!(&DMatrix::from(&c_sparse), &c_dense);
+    }
+
     #[test]
     fn spmm_csc_prealloc_panics_on_dim_mismatch(
         (alpha, beta, c, a, b)